diff --git a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java index 9abc2059b6a94..e5f673d630690 100644 --- a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java +++ b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java @@ -22,15 +22,14 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ - package build.tools.generatecharacter; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.util.Arrays; import java.util.stream.Collectors; -import java.util.stream.Stream; +import java.util.stream.IntStream; public class CaseFolding { @@ -42,32 +41,58 @@ public static void main(String[] args) throws Throwable { var templateFile = Paths.get(args[0]); var caseFoldingTxt = Paths.get(args[1]); var genSrcFile = Paths.get(args[2]); - var supportedTypes = "^.*; [CTS]; .*$"; + + // java.lang + var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding var caseFoldingEntries = Files.lines(caseFoldingTxt) - .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) - .map(line -> { - String[] cols = line.split("; "); - return new String[] {cols[0], cols[1], cols[2]}; - }) - .filter(cols -> { - // the folding case doesn't map back to the original char. 
- var cp1 = Integer.parseInt(cols[0], 16); - var cp2 = Integer.parseInt(cols[2], 16); - return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; - }) - .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) - .collect(Collectors.joining(",\n", "", "")); + .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + var foldingChars = Arrays.stream(folding) + .mapToObj(Character::toChars) + .flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int) chars[i])) + .toArray(); + return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)", + cp, + Arrays.stream(foldingChars) + .mapToObj(c -> String.format("0x%04x", c)) + .collect(Collectors.joining(", ", "new char[] {", "}")) + ); + }) + .collect(Collectors.joining(",\n", "", "")); + // util.regex + var expandedSupportedTypes = "^.*; [CTS]; .*$"; + var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes)) + .map(line -> { + String[] cols = line.split("; "); + return new String[]{cols[0], cols[1], cols[2]}; + }) + .filter(cols -> { + // the folding case doesn't map back to the original char. + var cp1 = Integer.parseInt(cols[0], 16); + var cp2 = Integer.parseInt(cols[2], 16); + return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; + }) + .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) + .collect(Collectors.joining(",\n", "", "")); // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. 
// 0049; T; 0131; # LATIN CAPITAL LETTER I final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); - // Generate .java file Files.write( - genSrcFile, - Files.lines(templateFile) - .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line) - .collect(Collectors.toList()), - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); + genSrcFile, + Files.lines(templateFile) + .map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line) + .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line) + .collect(Collectors.toList()), + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); } } diff --git a/make/modules/java.base/gensrc/GensrcCharacterData.gmk b/make/modules/java.base/gensrc/GensrcCharacterData.gmk index c05b126299b9e..d2f9f55ca393f 100644 --- a/make/modules/java.base/gensrc/GensrcCharacterData.gmk +++ b/make/modules/java.base/gensrc/GensrcCharacterData.gmk @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA) ################################################################################ + +GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java + +STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template +CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt + +$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT) + $(call LogInfo, Generating $@) + $(call MakeTargetDir) + $(TOOL_GENERATECASEFOLDING) \ + $(STRINGCASEFOLDING_TEMPLATE) \ + $(CASEFOLDINGTXT) \ + $(GENSRC_STRINGCASEFOLDING) + +TARGETS += $(GENSRC_STRINGCASEFOLDING) + + endif # include guard include MakeIncludeEnd.gmk diff --git a/make/modules/java.base/gensrc/GensrcRegex.gmk b/make/modules/java.base/gensrc/GensrcRegex.gmk index a30f22b34d4bf..c46a029e2c255 100644 --- 
a/make/modules/java.base/gensrc/GensrcRegex.gmk +++ b/make/modules/java.base/gensrc/GensrcRegex.gmk @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK) ################################################################################ -GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java - -CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template -CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt - -$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT) - $(call LogInfo, Generating $@) - $(call MakeTargetDir) - $(TOOL_GENERATECASEFOLDING) \ - $(CASEFOLDINGTEMP) \ - $(CASEFOLDINGTXT) \ - $(GENSRC_CASEFOLDING) - -TARGETS += $(GENSRC_CASEFOLDING) - -################################################################################ - endif # include guard include MakeIncludeEnd.gmk diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index a18ac3250dc86..1387ff8559e7c 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -2189,6 +2189,56 @@ public boolean equalsIgnoreCase(String anotherString) { && regionMatches(true, 0, anotherString, 0, length()); } + /** + * Compares this {@code String} to another {@code String} for equality, + * using Unicode case folding. Two strings are considered equal + * by this method if their case-folded forms are identical. + *
+ * Case folding is defined by the Unicode Standard in + * CaseFolding.txt, + * including 1:M mappings. For example, {@code "Maße".equalsFoldCase("MASSE")} + * returns {@code true}, since the character {@code U+00DF} (sharp s) folds + * to {@code "ss"}. + *
+ * Case folding is locale-independent and language-neutral, unlike + * locale-sensitive transformations such as {@link #toLowerCase()} or + * {@link #toUpperCase()}. It is intended for caseless matching, + * searching, and indexing. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #equalsIgnoreCase(String)}. It implements full case folding as + * defined by the Unicode Standard, which may differ from the simpler + * per-character mapping performed by {@code equalsIgnoreCase}. + * For example: + *
{@snippet lang=java :
+ * String a = "Maße";
+ * String b = "MASSE";
+ * boolean equalsFoldCase = a.equalsFoldCase(b); // returns true
+ * boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false
+ * }
+ *
+ * @param anotherString
+ * The {@code String} to compare this {@code String} against
+ *
+ * @return {@code true} if the given object is not {@code null} and represents
+ * the same sequence of characters as this string under Unicode case
+ * folding; {@code false} otherwise.
+ *
+ * @see #compareToFoldCase(String)
+ * @see #equalsIgnoreCase(String)
+ * @since 26
+ */
+    public boolean equalsFoldCase(String anotherString) {
+        // Same reference: trivially equal, no folding needed.
+        if (anotherString == this) {
+            return true;
+        }
+        // A null argument is never equal; otherwise the strings are equal exactly
+        // when the case-fold comparator reports no difference.
+        return anotherString != null
+                && UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0;
+    }
+
/**
* Compares two strings lexicographically.
* The comparison is based on the Unicode value of each character in
@@ -2310,6 +2360,76 @@ public int compareToIgnoreCase(String str) {
return CASE_INSENSITIVE_ORDER.compare(this, str);
}
+ /**
+ * A Comparator that orders {@code String} objects as by
+ * {@link #compareToFoldCase(String) compareToFoldCase()}.
+ *
+ * @see #compareToFoldCase(String)
+ * @since 26
+ */
+ public static final Comparator+ * Case folding is a locale-independent, language-neutral form of case mapping, + * primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)}, + * which applies a simpler locale-insensitive uppercase mapping, this method + * follows the Unicode full case folding, providing stable and + * consistent results across all environments. + *
+ * Note that this method does not take locale into account, and may + * produce results that differ from locale-sensitive ordering. Use + * {@link java.text.Collator} for locale-sensitive comparison. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #compareToIgnoreCase(String)}. It implements the full case folding + * as defined by the Unicode Standard, which may differ from the simpler + * per-character mapping performed by {@code compareToIgnoreCase}. + * For example: + *
{@snippet lang=java :
+ * String a = "Maße";
+ * String b = "MASSE";
+ * int cmpFoldCase = a.compareToFoldCase(b); // returns 0
+ * int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
+ * }
+ *
+ * @param str the {@code String} to be compared.
+ * @return a negative integer, zero, or a positive integer as the specified
+ * String is greater than, equal to, or less than this String,
+ * ignoring case considerations by case folding.
+ * @see #equalsFoldCase(String)
+ * @see #compareToIgnoreCase(String)
+ * @see java.text.Collator
+ * @since 26
+ */
+    public int compareToFoldCase(String str) {
+        // The ordering is defined entirely by the shared case-fold comparator.
+        final int order = UNICODE_CASEFOLD_ORDER.compare(this, str);
+        return order;
+    }
+
/**
* Tests if two string regions are equal.
* diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java index 61c62d049bcf1..8e9085ef209bc 100644 --- a/src/java.base/share/classes/java/lang/StringLatin1.java +++ b/src/java.base/share/classes/java/lang/StringLatin1.java @@ -32,6 +32,8 @@ import java.util.function.IntConsumer; import java.util.stream.Stream; import java.util.stream.StreamSupport; + +import jdk.internal.java.lang.CaseFolding; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.IntrinsicCandidate; @@ -62,6 +64,10 @@ static int length(byte[] value) { return value.length; } + static int codePointAt(byte[] value, int index, int end) { + return value[index] & 0xff; + } + static char[] toChars(byte[] value) { char[] dst = new char[value.length]; inflate(value, 0, dst, 0, value.length); @@ -179,6 +185,100 @@ static int compareToCI_UTF16(byte[] value, byte[] other) { return len1 - len2; } + public static int compareToFC(byte[] value, byte[] other) { + int len1 = value.length; + int len2 = other.length; + char[] folded1 = null; + char[] folded2 = null; + int k1 = 0, k2 = 0, fk1 = 0, fk2 = 0; + while ((k1 < len1 || folded1 != null && fk1 < folded1.length) && + (k2 < len2 || folded2 != null && fk2 < folded2.length)) { + char c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + int cp = codePointAt(value, k1++, len1); // no surrogate + folded1 = CaseFolding.foldIfDefined(cp); + fk1 = 0; + if (folded1 == null) { + c1 = (char)cp; + } else { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + int cp = codePointAt(other, k2++, len2); + folded2 = CaseFolding.foldIfDefined(cp); + fk2 = 0; + if (folded2 == null) { + c2 = (char)cp; + } else { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < len1 || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < len2 || folded2 != 
null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + /** + * Compares a Latin1-coded {@code value} with a UTF16-coded {@code other} under + * Unicode case folding, consuming 1:M foldings one {@code char} at a time. + * + * @param value the Latin1 bytes of the first string + * @param other the UTF16 bytes of the second string + * @return the difference of the first pair of folded units that differ; otherwise + * {@code 1} or {@code -1} if only {@code value} or only {@code other} has + * units left, or {@code 0} if both are exhausted together + */ + public static int compareToFC_UTF16(byte[] value, byte[] other) { + int len1 = value.length; + int len2 = StringUTF16.length(other); + char[] folded1 = null; + char[] folded2 = null; + int k1 = 0, k2 = 0, fk1 = 0, fk2 = 0; + + while ((k1 < len1 || folded1 != null && fk1 < folded1.length) && + (k2 < len2 || folded2 != null && fk2 < folded2.length)) { + /* int, not char: the UTF16 side can yield a supplementary code point */ + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + int cp = codePointAt(value, k1++, len1); + folded1 = CaseFolding.foldIfDefined(cp); + fk1 = 0; + if (folded1 == null) { + c1 = (char)cp; + } else { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + int cp = StringUTF16.codePointAt(other, k2, len2); + k2 += Character.charCount(cp); + folded2 = CaseFolding.foldIfDefined(cp); + fk2 = 0; + if (folded2 == null) { + /* keep the whole code point: narrowing to char would make a supplementary code point compare equal to the BMP char that shares its low 16 bits */ + c2 = cp; + } else { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < len1 || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < len2 || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0); } diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java index 4e31c9728e9cd..74634d869fc1e 100644 --- a/src/java.base/share/classes/java/lang/StringUTF16.java +++ b/src/java.base/share/classes/java/lang/StringUTF16.java @@ -34,6 +34,7 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.java.lang.CaseFolding; import jdk.internal.misc.Unsafe; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.ForceInline; @@ -592,6 +593,62 @@ static int compareToCI_Latin1(byte[] value, byte[] other) { return 
-StringLatin1.compareToCI_UTF16(other, value); } + public static int compareToFC_Latin1(byte[] value, byte[] other) { + return -StringLatin1.compareToFC_UTF16(other, value); + } + + public static int compareToFC(byte[] value, byte[] other) { + int len1 = length(value); + int len2 = length(other); + char[] folded1 = null; + char[] folded2 = null; + int k1 = 0, k2 = 0, fk1 = 0, fk2 = 0; + while ((k1 < len1 || folded1 != null && fk1 < folded1.length) && + (k2 < len2 || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = Character.codePointAt(folded1, fk1); + fk1 += Character.charCount(c1); + } else { + int cp = codePointAt(value, k1, len1, true); + k1 += Character.charCount(cp); + folded1 = CaseFolding.foldIfDefined(cp); + fk1 = 0; + if (folded1 == null) { + c1 = cp; + } else { + c1 = Character.codePointAt(folded1, 0); + fk1 += Character.charCount(c1); + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = Character.codePointAt(folded2, fk2); + fk2 += Character.charCount(c2); + } else { + int cp = codePointAt(other, k2, len2, true); + k2 += Character.charCount(cp); + folded2 = CaseFolding.foldIfDefined(cp); + fk2 = 0; + if (folded2 == null) { + c2 = cp; + } else { + c2 = Character.codePointAt(folded2, 0); + fk2 += Character.charCount(c2); + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < len1 || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < len2 || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0); } diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index 2908370acd551..8e50b66ca86e6 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -43,8 +43,8 @@ import 
java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.java.lang.CaseFolding; import jdk.internal.util.ArraysSupport; -import jdk.internal.util.regex.CaseFolding; import jdk.internal.util.regex.Grapheme; /** diff --git a/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template new file mode 100644 index 0000000000000..160536ce6a2e7 --- /dev/null +++ b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package jdk.internal.java.lang; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static java.util.Map.entry; + +/** + * Utility class for {@code String.toCaseFold()} that handles Unicode case folding + * properties defined in CaseFolding.txt, including 1:M full case folding. + */ + public final class CaseFolding { + + private CaseFolding() {} + + /** + * Tests whether the specified code point is already in its case-folded form. + *
+ * A code point is considered folded if it does not have an explicit case + * folding mapping in the Unicode CaseFolding data. + * + * @param cp + * the Unicode code point to test + * @return {@code true} if the given code point has no case + * folding mapping (that is, it is already folded); + * {@code false} otherwise + * + * @see #fold(int) + */ + public static boolean isFolded(int cp) { + return caseFoldingMap.get(cp) == null; + } + + /** + * Returns the case-folded form of the specified code point, according + * to the Unicode case folding mappings. + *
+ * If the code point has no case folding mapping, this method returns + * the original code point as a single-element array. Otherwise, it + * returns the mapped form, which may consist of one or more {@code char} + * values (to support 1:M mappings). + * + * @param cp + * the Unicode code point to fold + * @return an array of {@code char} values representing the + * case-folded form of the input code point + * + * @see #isFolded(int) + */ + public static char[] fold(int cp) { + var entry = caseFoldingMap.get(cp); + if (entry != null) + return entry.folding; + return Character.toChars(cp); + } + + /** + * Returns the case-folded form of the specified code point defined + * by the Unicode case folding mappings. + *
+ * If the code point has no case folding mapping defined, this method + * returns null. Otherwise, it returns the mapped form, which may consist + * of one or more {@code char} values (to support 1:M mappings). + * + * @param cp + * the Unicode code point to fold + * @return an array of {@code char} values representing the + * case-folded form of the input code point, null if + * there is no mapping defined. + */ + public static char[] foldIfDefined(int cp) { + var entry = caseFoldingMap.get(cp); + return entry != null ? entry.folding : null; + } + + /** + * Returns a case-folded copy of the given {@code String} object, using the + * Unicode case folding mappings defined in + * + * Unicode Case Folding Properties. + *
+ * This is a convenience method intended primarily for testing + * {@link #isFolded(int)} and {@link #fold(int)}. Its implementation is + * not optimized for performance and should not be used in performance- + * sensitive contexts. It exists only until a dedicated + * {@code String.toCaseFold()} method is introduced. + * + * @param s + * the input string + * @return a {@code String} containing the case-folded form of the input string + */ + public static String fold(String s) { + int first; + int len = s.length(); + int cpCnt = 1; + for (first = 0; first < len; first += cpCnt) { + int cp = s.codePointAt(first); + if (!CaseFolding.isFolded(cp)) { + break; + } + cpCnt = Character.charCount(cp); + } + if (first == len) { + return s; + } + StringBuilder sb = new StringBuilder(len); + sb.append(s, 0, first); + for (int i = first; i < len; i += cpCnt) { + int cp = s.codePointAt(i); + if (CaseFolding.isFolded(cp)) { + sb.appendCodePoint(cp); + } else { + char[] folded = CaseFolding.fold(cp); + sb.append(folded); + } + cpCnt = Character.charCount(cp); + } + return sb.toString(); + } + + /** + * Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive + * matching, according to the + * Simple Loose Matches + * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions. + *
+ * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must + * be applied to literals and (optionally) to character classes. When applied to character classes, each + * character class is expected to be closed under simple case folding. See the standard for the + * detailed explanation and example of "closed". + *
+ * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should + *
+ * In the {@code Pattern} implementation, 5 types of constructs may be case-sensitive when matching: + * back-refs, string slice (sequences), single, family(char-property) and class range. Single and + * family may appear independently or within a class. + *
+ * For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and + * {@code toLowerCase} to both the pattern and the input string. This effectively 'closes' the class for + * matching. + *
+ * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5, + * if their behavior is clearly specified. + *
+ * This method addresses that requirement for the "range" construct within a character class by computing + * the additional characters that should be included to close the range under simple case folding: + *
+ * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple + * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped + * character is not already in the range, then that mapped character (typically lowercase) is added to + * the expansion set. + *
+ * This allows regex character class "range" implementation to use the returned expansion set to support + * additional case-insensitive matching, without duplicating characters already covered by the existing + * regex range implementation. The expectation is the matching is done using both the uppercase and + * lowercase forms of the input character, for example + * + *
{@code
+ *
+ * ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
+ * inRange(lower, Character.toLowerCase(ch), upper) ||
+ * additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
+ * additionalClosingCharacters.contains(Character.toLowerCase(ch))
+ * }
+ *
+ *
+ * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
+ * @param start the starting code point of the character range
+ * @param end the ending code point of the character range
+ * @return an {@code int[]} containing all the simple case equivalents of characters in the range, excluding
+ * those already in the range
+ */
+    public static int[] getClassRangeClosingCharacters(int start, int end) {
+        // Sized for the worst case: every expandable code point contributes one entry.
+        int[] closing = new int[expanded_case_cps.length];
+        int count = 0;
+        for (int source : expanded_case_cps) {
+            if (source < start || source > end) {
+                continue;   // source lies outside [start, end]
+            }
+            int target = expanded_case_map.get(source);
+            boolean targetInRange = target >= start && target <= end;
+            if (!targetInRange) {
+                // Only mappings that leave the range need to be added to close it.
+                closing[count++] = target;
+            }
+        }
+        // Trim the unused tail.
+        return Arrays.copyOf(closing, count);
+    }
+
+ private static final Map
- * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
- * be applied to literals and (optionally) to character classes. When applied to character classes, each
- * character class is expected to be closed under simple case folding. See the standard for the
- * detailed explanation and example of "closed".
- *
- * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
- *
- * In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching:
- * back-refs, string slice (sequences), single, family(char-property) and class range. Single and
- * family may appears independently or within a class.
- *
- * For loose/case-insensitive matching, the back-refs, slices and singles apply {code toUpperCase} and
- * {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for
- * matching.
- *
- * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
- * if their behavior is clearly specified.
- *
- * This method addresses that requirement for the "range" construct within in character class by computing
- * the additional characters that should be included to close the range under simple case folding:
- *
- * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
- * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
- * character is not already in the range, then that mapped character (typically lowercase) is added to
- * the expansion set.
- *
- * This allows regex character class "range" implementation to use the returned expansion set to support
- * additional case-insensitive matching, without duplicating characters already covered by the existing
- * regex range implementation. The expectation is the matching is done using both the uppercase and
- * lowercase forms of the input character, for example
- *
- *
- * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
- * @param start the starting code point of the character range
- * @param end the ending code point of the character range
- * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding
- * those already in the range
- */
- public static int[] getClassRangeClosingCharacters(int start, int end) {
- int[] expanded = new int[expanded_case_cps.length];
- int off = 0;
- for (int cp : expanded_case_cps) {
- if (cp >= start && cp <= end) {
- int folding = expanded_case_map.get(cp);
- if (folding < start || folding > end) {
- expanded[off++] = folding;
- }
- }
- }
- return Arrays.copyOf(expanded, off);
- }
-}
diff --git a/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
new file mode 100644
index 0000000000000..cd8b7035b9751
--- /dev/null
+++ b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @summary tests unicode case-folding based String comparison and equality
+ * @bug 4397357
+ * @library /lib/testlibrary/java/lang
+ * @compile --add-exports java.base/jdk.internal.java.lang=ALL-UNNAMED
+ * UnicodeCaseFoldingTest.java
+ * @run junit/othervm --add-exports java.base/jdk.internal.java.lang=ALL-UNNAMED
+ * UnicodeCaseFoldingTest
+ */
+import java.nio.file.Files;
+import java.util.stream.Stream;
+import java.util.stream.Collectors;
+import java.util.ArrayList;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import jdk.internal.java.lang.CaseFolding;
+
+public class UnicodeCaseFoldingTest {
+
+    /**
+     * For every common (C) and full (F) entry in CaseFolding.txt, checks that
+     * CaseFolding.fold() yields the listed folding and that compareToFoldCase()
+     * and equalsFoldCase() treat the source and its folding as equal in both
+     * directions.
+     */
+    @Test
+    void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable {
+        var filter = "^.*; [CF]; .*$";  // C=common, F=full, for full case folding
+        // Files.lines keeps the file open until the stream is closed, so scope it
+        // with try-with-resources.
+        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
+            lines.filter(line -> !line.startsWith("#") && line.matches(filter))
+                .forEach(line -> {
+                    var fields = line.split("; ");
+                    var cp = Integer.parseInt(fields[0], 16);
+                    fields = fields[2].trim().split(" ");
+                    var folding = new int[fields.length];
+                    for (int i = 0; i < folding.length; i++) {
+                        folding[i] = Integer.parseInt(fields[i], 16);
+                    }
+                    var source = new String(Character.toChars(cp));
+                    var expected = new String(folding, 0, folding.length);
+
+                    // (1) Verify the folding result matches expected
+                    assertEquals(expected, CaseFolding.fold(source), "CaseFolding.fold(): " + line);
+
+                    // (2) Verify compareToFoldCase() result
+                    assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected): " + line);
+                    assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source): " + line);
+
+                    // (3) Verify equalsFoldCase() result
+                    assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected): " + line);
+                    assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source): " + line);
+                });
+        }
+    }
+
+    /**
+     * For every simple (S) entry in CaseFolding.txt, checks that
+     * compareToFoldCase() and equalsFoldCase() treat the source and its listed
+     * simple folding as equal in both directions.
+     */
+    @Test
+    void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable {
+        // S=simple, for simple case folding. The simple case folding should still match.
+        var filter = "^.*; [S]; .*$";
+        // Files.lines keeps the file open until the stream is closed, so scope it
+        // with try-with-resources.
+        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
+            lines.filter(line -> !line.startsWith("#") && line.matches(filter))
+                .forEach(line -> {
+                    var fields = line.split("; ");
+                    var cp = Integer.parseInt(fields[0], 16);
+                    fields = fields[2].trim().split(" ");
+                    var folding = new int[fields.length];
+                    for (int i = 0; i < folding.length; i++) {
+                        folding[i] = Integer.parseInt(fields[i], 16);
+                    }
+                    var source = new String(Character.toChars(cp));
+                    var expected = new String(folding, 0, folding.length);
+
+                    // (1) Verify compareToFoldCase() result
+                    assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected): " + line);
+                    assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source): " + line);
+
+                    // (2) Verify equalsFoldCase() result
+                    assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected): " + line);
+                    assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source): " + line);
+                });
+        }
+    }
+
+ @Test
+ public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception {
+ // Collect all code points that appear in CaseFolding.txt
+ var listed = Files.lines(UCDFiles.CASEFOLDING)
+ .filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$"))
+ .map(line -> Integer.parseInt(line.split("; ")[0], 16))
+ .collect(Collectors.toSet());
+
+ var failures = new ArrayList
- *
- * {@code
- *
- * ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
- * inRange(lower, Character.toLower(ch), upper) ||
- * additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
- * additionalClosingCharacters.contains(Character.toUpperCase(ch))
- * }
- *
- *