diff --git a/jdk/make/data/characterdata/CharacterData00.java.template b/jdk/make/data/characterdata/CharacterData00.java.template index 388072a4642..eb794790d27 100644 --- a/jdk/make/data/characterdata/CharacterData00.java.template +++ b/jdk/make/data/characterdata/CharacterData00.java.template @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -106,9 +106,11 @@ class CharacterData00 extends CharacterData { boolean isJavaIdentifierStart(int ch) { // isJavaIdentifierStart strictly conforms to code points assigned - // in Unicode 6.2. Since code points {32FF} and {20BB..20BF} are not - // from Unicode 6.2, return false. - if(ch == 0x32FF || (ch>= 0x20BB && ch<= 0x20BF)) + // in Unicode 6.2. Since code points {32FF}, {20BB..20BF}, and + // {9FCD..9FEF} are not from Unicode 6.2, return false. + if(ch == 0x32FF || + (ch>= 0x20BB && ch<= 0x20BF) || + (ch>= 0x9FCD && ch<= 0x9FEF)) return false; int props = getProperties(ch); return ((props & $$maskIdentifierInfo) >= $$lowJavaStart); @@ -116,9 +118,11 @@ class CharacterData00 extends CharacterData { boolean isJavaIdentifierPart(int ch) { // isJavaIdentifierPart strictly conforms to code points assigned - // in Unicode 6.2. Since code points {32FF} and {20BB..20BF} are not - // from Unicode 6.2, return false. - if(ch == 0x32FF || (ch>= 0x20BB && ch<= 0x20BF)) + // in Unicode 6.2. Since code points {32FF}, {20BB..20BF}, and + // {9FCD..9FEF} are not from Unicode 6.2, return false. 
+ if(ch == 0x32FF || + (ch>= 0x20BB && ch<= 0x20BF) || + (ch>= 0x9FCD && ch<= 0x9FEF)) return false; int props = getProperties(ch); return ((props & $$nonzeroJavaPart) != 0); diff --git a/jdk/make/data/unicodedata/UnicodeData.txt b/jdk/make/data/unicodedata/UnicodeData.txt index 062805885d9..a121b68d62e 100644 --- a/jdk/make/data/unicodedata/UnicodeData.txt +++ b/jdk/make/data/unicodedata/UnicodeData.txt @@ -11732,7 +11732,7 @@ 4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;; 4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;; 4E00;;Lo;0;L;;;;;N;;;;; -9FCC;;Lo;0;L;;;;;N;;;;; +9FEF;;Lo;0;L;;;;;N;;;;; A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;; A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;; A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;; diff --git a/jdk/src/share/classes/java/lang/Character.java b/jdk/src/share/classes/java/lang/Character.java index 293a33c9a7f..e7f994619ba 100644 --- a/jdk/src/share/classes/java/lang/Character.java +++ b/jdk/src/share/classes/java/lang/Character.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -52,13 +52,17 @@ * http://www.unicode.org. *

* The Java SE 8 Platform uses character information from version 6.2 - * of the Unicode Standard, with two extensions. First, the Java SE 8 Platform - * allows an implementation of class {@code Character} to use the Japanese Era - * code point, {@code U+32FF}, from the first version of the Unicode Standard - * after 6.2 that assigns the code point. Second, in recognition of the fact + * of the Unicode Standard, with three extensions. First, in recognition of the fact * that new currencies appear frequently, the Java SE 8 Platform allows an * implementation of class {@code Character} to use the Currency Symbols - * block from version 10.0 of the Unicode Standard. Consequently, the + * block from version 10.0 of the Unicode Standard. Second, the Java SE 8 Platform + * allows an implementation of class {@code Character} to use the code points + * in the range of {@code U+9FCD} to {@code U+9FEF} from version 11.0 of the + * Unicode Standard, in order for the class to support "Implementation + * Level 1" of the Chinese GB18030-2022 standard. Third, the Java SE 8 Platform + * allows an implementation of class {@code Character} to use the Japanese Era + * code point, {@code U+32FF}, from version 12.1 of the Unicode Standard. 
+ * Consequently, the * behavior of fields and methods of class {@code Character} may vary across * implementations of the Java SE 8 Platform when processing the aforementioned * code points ( outside of version 6.2 ), except for the following methods diff --git a/jdk/test/java/lang/Character/Scripts.txt b/jdk/test/java/lang/Character/Scripts.txt index c6e63c4525d..d9d21393f44 100644 --- a/jdk/test/java/lang/Character/Scripts.txt +++ b/jdk/test/java/lang/Character/Scripts.txt @@ -1433,7 +1433,7 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK 3038..303A ; Han # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY 303B ; Han # Lm VERTICAL IDEOGRAPHIC ITERATION MARK 3400..4DB5 ; Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5 -4E00..9FCC ; Han # Lo [20941] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FCC +4E00..9FEF ; Han # Lo [20976] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FEF F900..FA6D ; Han # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6 @@ -1441,7 +1441,7 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI 2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D 2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D -# Total code points: 75963 +# Total code points: 75998 # ================================================ diff --git a/jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java b/jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java index 3e788ea0d10..8a0bad21a8b 100644 --- a/jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java +++ b/jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Oracle 
and/or its affiliates. All rights reserved. + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,22 +28,16 @@ * @bug 8218915 */ -import java.util.List; -import java.util.ArrayList; - public class TestIsJavaIdentifierMethods { - - // List of new code points are not present in Unicode 6.2. - private static final List UNASSIGNED_CODEPOINTS_IN_6_2 - = new ArrayList() - {{ - add(0x20BB); // NORDIC MARK SIGN - add(0x20BC); // MANAT SIGN - add(0x20BD); // RUBLE SIGN - add(0x20BE); // LARI SIGN - add(0x20BF); // BITCOIN SIGN - add(0x32FF); // SQUARE ERA NAME NEWERA - }}; + // Unassigned code points not present in Unicode 6.2 (which Java SE 8 + // is based upon), including: various currency symbol sign code points + // (Nordic Mark ... Bitcoin), Japanese Era Square character code point, + // and 35 CJK Unified Ideograph code points from GB18030-2022 + private static final int CS_SIGNS_CODEPOINT_START = 0x20BB; + private static final int CS_SIGNS_CODEPOINT_END = 0x20BF; + private static final int JAPANESE_ERA_CODEPOINT = 0x32FF; + private static final int GB18030_2022_CODEPOINT_START = 0x9FCD; + private static final int GB18030_2022_CODEPOINT_END = 0x9FEF; public static void main(String[] args) { testIsJavaIdentifierPart_int(); @@ -75,14 +69,15 @@ public static void main(String[] args) { public static void testIsJavaIdentifierPart_int() { for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) { boolean expected = false; - // Since Character.isJavaIdentifierPart(int) strictly conforms to // character information from version 6.2 of the Unicode Standard, - // check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2" - // list. If the code point is found in list - // "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable - // "expected" is considered false. 
- if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(cp)) { + // check if code point is one of the extra unassigned + // code points (defined at the beginning of the file). If the code + // point is found to be one of the unassigned code points, + // value of variable "expected" is considered false. + if (cp != JAPANESE_ERA_CODEPOINT && + !(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) && + !(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) { byte type = (byte) Character.getType(cp); expected = Character.isLetter(cp) || type == Character.CURRENCY_SYMBOL @@ -124,11 +119,13 @@ public static void testIsJavaIdentifierPart_char() { boolean expected = false; // Since Character.isJavaIdentifierPart(char) strictly conforms to // character information from version 6.2 of the Unicode Standard, - // check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2" - // list. If the code point is found in list - // "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable - // "expected" is considered false. - if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) { + // check if code point is one of the extra unassigned + // code points (defined at the beginning of the file). If the code + // point is found to be one of the unassigned code points, + // value of variable "expected" is considered false. + if (i != JAPANESE_ERA_CODEPOINT && + !(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) && + !(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) { byte type = (byte) Character.getType(ch); expected = Character.isLetter(ch) || type == Character.CURRENCY_SYMBOL @@ -165,11 +162,13 @@ public static void testIsJavaIdentifierStart_int() { boolean expected = false; // Since Character.isJavaIdentifierStart(int) strictly conforms to // character information from version 6.2 of the Unicode Standard, - // check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2" - // list. 
If the code point is found in list - // "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable - // "expected" is considered false. - if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(cp)) { + // check if code point is one of the extra unassigned + // code points (defined at the beginning of the file). If the code + // point is found to be one of the unassigned code points, + // value of variable "expected" is considered false. + if (cp != JAPANESE_ERA_CODEPOINT && + !(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) && + !(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) { byte type = (byte) Character.getType(cp); expected = Character.isLetter(cp) || type == Character.LETTER_NUMBER @@ -203,11 +202,13 @@ public static void testIsJavaIdentifierStart_char() { boolean expected = false; // Since Character.isJavaIdentifierStart(char) strictly conforms to // character information from version 6.2 of the Unicode Standard, - // check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2" - // list. If the code point is found in list - // "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable - // "expected" is considered false. - if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) { + // check if code point is one of the extra unassigned + // code points (defined at the beginning of the file). If the code + // point is found to be one of the unassigned code points, + // value of variable "expected" is considered false. 
+ if (i != JAPANESE_ERA_CODEPOINT && + !(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) && + !(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) { byte type = (byte) Character.getType(ch); expected = Character.isLetter(ch) || type == Character.LETTER_NUMBER @@ -241,11 +242,13 @@ public static void testIsJavaLetter() { boolean expected = false; // Since Character.isJavaLetter(char) strictly conforms to // character information from version 6.2 of the Unicode Standard, - // check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2" - // list. If the code point is found in list - // "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable - // "expected" is considered false. - if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) { + // check if code point is one of the extra unassigned + // code points (defined at the beginning of the file). If the code + // point is found to be one of the unassigned code points, + // value of variable "expected" is considered false. + if (i != JAPANESE_ERA_CODEPOINT && + !(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) && + !(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) { byte type = (byte) Character.getType(ch); expected = Character.isLetter(ch) || type == Character.LETTER_NUMBER @@ -283,11 +286,13 @@ public static void testIsJavaLetterOrDigit() { boolean expected = false; // Since Character.isJavaLetterOrDigit(char) strictly conforms to // character information from version 6.2 of the Unicode Standard, - // check if code point is in "UNASSIGNED_CODEPOINTS_IN_6_2" - // list. If the code point is found in list - // "UNASSIGNED_CODEPOINTS_IN_6_2", value of variable - // "expected" is considered false. - if (!UNASSIGNED_CODEPOINTS_IN_6_2.contains(i)) { + // check if code point is one of the extra unassigned + // code points (defined at the beginning of the file). 
If the code + // point is found to be one of the unassigned code points, + // value of variable "expected" is considered false. + if (i != JAPANESE_ERA_CODEPOINT && + !(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) && + !(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) { byte type = (byte) Character.getType(ch); expected = Character.isLetter(ch) || type == Character.CURRENCY_SYMBOL