From de44ea74b6cd0416c7951da4f81c972aca9637a7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 19:36:31 +0100 Subject: [PATCH 01/14] =?UTF-8?q?Don=E2=80=99t=20invert=20twice=20on=20com?= =?UTF-8?q?parison=20queries,=20add=20support=20for=20null=20queries,=20al?= =?UTF-8?q?ign=20identity=20queries=20with=20the=20draft?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/unicode/jsp/UnicodeSetUtilities.java | 10 +++++++- .../org/unicode/jsptest/TestUnicodeSet.java | 23 +++++++++++++++++-- docs/help/list-unicodeset.md | 4 ++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 34eed8b30d..dd2075df27 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -250,13 +250,16 @@ private boolean applyPropertyAlias0( } UnicodeProperty otherProperty = null; boolean testCp = false; + boolean testNone = false; if (trimmedPropertyValue.length() > 1 && trimmedPropertyValue.startsWith("@") && trimmedPropertyValue.endsWith("@")) { String otherPropName = trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1).trim(); - if ("cp".equalsIgnoreCase(otherPropName)) { + if (UnicodeProperty.equalNames("code point", otherPropName)) { testCp = true; + } else if (UnicodeProperty.equalNames("none", otherPropName)) { + testNone = true; } else { otherProperty = factory.getProperty(otherPropName); } @@ -270,8 +273,12 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(i, prop.getValue(i))) { set.add(i); } + invert = false; } + } else if (testNone) { + set = prop.getSet(UnicodeProperty.NULL_MATCHER); } else if (otherProperty != null) { + System.err.println(otherProperty + ", " + invert); set = new UnicodeSet(); for (int i = 0; i <= 0x10FFFF; ++i) { String v1 = prop.getValue(i); @@ -279,6 +286,7 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(v1, v2)) { set.add(i); } + invert = false; } } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index a68acbc108..eabddd810c 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -141,6 +141,25 @@ public void TestPretty() { logln(derived); } + @Test + public void TestInteriorlyNegatedComparison() { + checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]"); + checkSetsEqual( + "\\p{Uppercase≠@Changes_When_Lowercased@}", + "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); + } + + @Test + public void TestIdentityQuery() { + checkSetsEqual("\\p{NFKC_Casefold=@codepoint@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold=@Code_Point@}", "\\P{Changes_When_NFKC_Casefolded}"); + } + + @Test + public void TestNullQuery() { + checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=None}"); + } + // public void TestAExemplars() { // checkProperties("[:exemplars_en:]", "[a]", "[\u0350]"); // } @@ -380,7 +399,7 @@ public void TestGC() { public void TestNF() { for (String nf : new String[] {"d", "c", "kd", "kc"}) { checkSetsEqual("[:isnf" + nf + ":]", "[:nf" + nf + "qc!=N:]"); - checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@cp@:]"); + checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@code point@:]"); } } @@ -479,7 +498,7 @@ public void TestSetSyntax() { checkProperties("\\p{isNFC}", "[:ASCII:]", "[\u212B]"); checkProperties("[:isNFC=no:]", "[\u212B]", "[:ASCII:]"); checkProperties("[:dt!=none:]&[:toNFD=/^\\p{ccc:0}/:]", "[\u00A0]", "[\u0340]"); - checkProperties("[:toLowercase!=@cp@:]", "[A-Z\u00C0]", "[abc]"); + checkProperties("[:toLowercase!=@code point@:]", "[A-Z\u00C0]", "[abc]"); checkProperties("[:toNfkc!=@toNfc@:]", "[\\u00A0]", "[abc]"); String trans1 = Common.NFKC_CF.transform("\u2065"); diff --git a/docs/help/list-unicodeset.md b/docs/help/list-unicodeset.md index 1d9f22ff82..9afa8c3a27 100644 --- a/docs/help/list-unicodeset.md +++ b/docs/help/list-unicodeset.md @@ -113,7 +113,7 @@ There is a special property "cp" that returns the code point itself. For example: * Find the characters whose lowercase is different: - [`\p{toLowercase!=@cp@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40cp%40%7D&g=) + [`\p{toLowercase!=@code point@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40code%20point%40%7D&g=) ## **Available Properties** @@ -157,7 +157,7 @@ then set the Group By box to the property name. 1. uca (the primary UCA weight -- after the CLDR transforms), 2. uca2 (the primary and secondary weights) -Normally, \\p{isX} is equivalent to `\p{toX=@cp@}`. There are some exceptions and +Normally, \\p{isX} is equivalent to `\p{toX=@code point@}`. There are some exceptions and missing cases. Note: The Unassigned, Surrogate, and Private Use code points are skipped in the From 73485b303bcd94377b43fa8e1d91b0c07ccde4f9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 19:41:55 +0100 Subject: [PATCH 02/14] more tests --- .../test/java/org/unicode/jsptest/TestUnicodeSet.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index eabddd810c..15da98ca8a 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -144,6 +144,10 @@ public void TestPretty() { @Test public void TestInteriorlyNegatedComparison() { checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]"); + checkSetsEqual( + "\\p{Uppercase≠@Changes_When_Lowercased@}", + "\\P{Uppercase=@Changes_When_Lowercased@}"); + checkSetsEqual( "\\p{Uppercase≠@Changes_When_Lowercased@}", "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); @@ -151,13 +155,14 @@ public void TestInteriorlyNegatedComparison() { @Test public void TestIdentityQuery() { - checkSetsEqual("\\p{NFKC_Casefold=@codepoint@}", "\\P{Changes_When_NFKC_Casefolded}"); - checkSetsEqual("\\p{NFKC_Casefold=@Code_Point@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold≠@Code_Point@}", "\\p{Changes_When_NFKC_Casefolded}"); } @Test public void TestNullQuery() { checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=None}"); + checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); } // public void TestAExemplars() { From 99d5625723d058d958ebf195da9b62ceec6d1a27 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 19:59:51 +0100 Subject: [PATCH 03/14] LM3 is --- .../src/test/java/org/unicode/jsptest/TestUnicodeSet.java | 4 ++-- .../src/main/java/org/unicode/props/UnicodeProperty.java | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index 15da98ca8a..fe817fa9d5 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -149,7 +149,7 @@ public void TestInteriorlyNegatedComparison() { "\\P{Uppercase=@Changes_When_Lowercased@}"); checkSetsEqual( - "\\p{Uppercase≠@Changes_When_Lowercased@}", + "\\p{Is_Uppercase≠@Changes_When_Lowercased@}", "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); } @@ -161,7 +161,7 @@ public void TestIdentityQuery() { @Test public void TestNullQuery() { - checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=None}"); + checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=Is_None}"); checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 208d98f974..0169761b3c 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -691,8 +691,7 @@ public static int compareNames(String a, String b) { return toSkeleton(a).compareTo(toSkeleton(b)); } - /** Utility for managing property & non-string value aliases */ - // TODO account for special names, tibetan, hangul + /** Returns a representative of the equivalence class of source under UAX44-LM3. */ public static String toSkeleton(String source) { if (source == null) return null; StringBuffer skeletonBuffer = new StringBuffer(); @@ -713,6 +712,10 @@ public static String toSkeleton(String source) { } } } + while (skeletonBuffer.subSequence(0, 2).equals("is")) { + gotOne = true; + skeletonBuffer.delete(0, 2); + } if (!gotOne) return source; // avoid string creation return skeletonBuffer.toString(); } From 2bb2c23c9a41a023f7a8c4f6aa2297778ac755dc Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 20:06:59 +0100 Subject: [PATCH 04/14] comments --- .../src/main/java/org/unicode/jsp/UnicodeSetUtilities.java | 2 ++ .../src/main/java/org/unicode/props/UnicodeProperty.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index dd2075df27..15b726d4d6 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -264,6 +264,8 @@ private boolean applyPropertyAlias0( otherProperty = factory.getProperty(otherPropName); } } + // TODO(egg): Name and Name_Alias require special handling (UAX44-LM2), and + // treating Name_Alias as aliases for Name. boolean isAge = UnicodeProperty.equalNames("age", propertyName); if (prop != null) { UnicodeSet set; diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 0169761b3c..1e1f755497 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -720,7 +720,7 @@ public static String toSkeleton(String source) { return skeletonBuffer.toString(); } - // get the name skeleton + /** Returns a representative of the equivalence class of source under UAX44-LM2. */ public static String toNameSkeleton(String source) { if (source == null) return null; StringBuffer result = new StringBuffer(); From ae2750c6a432288d50437e1937ee568c4946828d Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 20:52:22 +0100 Subject: [PATCH 05/14] out of bounds --- .../src/main/java/org/unicode/props/UnicodeProperty.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 1e1f755497..ee1ef259b9 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -712,7 +712,7 @@ public static String toSkeleton(String source) { } } } - while (skeletonBuffer.subSequence(0, 2).equals("is")) { + while (skeletonBuffer.length() >= 2 && skeletonBuffer.subSequence(0, 2).equals("is")) { gotOne = true; skeletonBuffer.delete(0, 2); } From 244f2e26c3aaf8408d28f337ff1ab16a726720ec Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 20:59:51 +0100 Subject: [PATCH 06/14] Check lb=@none@ (though that should probably be an error). --- .../src/test/java/org/unicode/jsptest/TestUnicodeSet.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index fe817fa9d5..a47d607897 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -161,6 +161,12 @@ public void TestIdentityQuery() { @Test public void TestNullQuery() { + // Check that we are not falling into the trap described in + // https://www.unicode.org/reports/tr44/#UAX44-LM3. + checkProperties("\\p{lb=IS}", "[,.:;]"); + // TODO(egg): This should perhaps be an error. But if it is not an error, it + // should be empty. + checkSetsEqual("\\p{lb=@none@}", "[]"); checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=Is_None}"); checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); } From 30bca05aa4cd44ed3c3e228838f68a95a4c5cefe Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 21:16:05 +0100 Subject: [PATCH 07/14] Millionfold falsification --- .../src/main/java/org/unicode/jsp/UnicodeSetUtilities.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 15b726d4d6..e4f323a3db 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -275,8 +275,8 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(i, prop.getValue(i))) { set.add(i); } - invert = false; } + invert = false; } else if (testNone) { set = prop.getSet(UnicodeProperty.NULL_MATCHER); } else if (otherProperty != null) { @@ -288,8 +288,8 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(v1, v2)) { set.add(i); } - invert = false; } + invert = false; } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { throw new IllegalArgumentException( From 1c157fb06ee7e5fe811ff0cbc160beab2c3b11ab Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 04:40:45 +0100 Subject: [PATCH 08/14] =?UTF-8?q?Need=20to=20figure=20out=20how=20to=20mak?= =?UTF-8?q?e=20Name=5FAlias=20behave=20as=20an=20alias=20for=20Name?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/unicode/jsptest/TestUnicodeSet.java | 29 ++++++++++++++ .../unicode/props/IndexUnicodeProperties.java | 1 + .../unicode/props/PropertyParsingInfo.java | 4 +- .../org/unicode/props/UnicodeProperty.java | 39 ++++++++++++++++--- 4 files changed, 66 insertions(+), 7 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index a47d607897..87324472c5 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -37,6 +37,8 @@ import org.unicode.jsp.UnicodeSetUtilities; import org.unicode.jsp.UnicodeUtilities; import org.unicode.jsp.XPropertyFactory; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; import org.unicode.props.UnicodeProperty; public class TestUnicodeSet extends TestFmwk2 { @@ -153,6 +155,33 @@ public void TestInteriorlyNegatedComparison() { "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); } + @Test + public void TestNameMatching() { + // UAX44-LM2 for both Name and Name_Alias. + checkSetsEqual("\\p{Name=NO-BREAK SPACE}", "[\\xA0]"); + checkSetsEqual("\\p{Name=no break space}", "[\\xA0]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-E}", "[\\u1180]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]"); + checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]"); + checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]"); + checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]"); + checkSetsEqual("\\p{Name=TIBETAN MARK TSA PHRU}", "[]"); + checkSetsEqual("\\p{Name=TIBETAN MARK BKA- SHOG YIG MGO}", "[\\u0F0A]"); + checkSetsEqual("\\p{Name=TIBETAN MARK BKA SHOG YIG MGO}", "[]"); + checkSetsEqual("\\p{Name_Alias=newline}", "[\\x0A]"); + checkSetsEqual("\\p{Name_Alias=NEW LINE}", "[\\x0A]"); + } + + @Test + public void TestNameAliases() { + // Name_Alias values behave as aliases for Name, but not vice-versa. + checkSetsEqual("\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[︘]"); + checkSetsEqual("\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]"); + checkSetsEqual("\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[]"); + checkSetsEqual("\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]"); + checkProperties("\\p{Name_Alias=@none@}", "[a-z]"); + } + @Test public void TestIdentityQuery() { checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}"); diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index bdbd14e582..b9c68d61d8 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -196,6 +196,7 @@ public Map getCacheFileSize() { static final Transform fromNumericPinyin = Transliterator.getInstance("NumericPinyin-Latin;nfc"); + static final Merge MULTIVALUED_JOINER = new PropertyUtilities.Joiner("|"); static final Merge ALPHABETIC_JOINER = new Merge() { TreeSet sorted = new TreeSet(); diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java index 5e60f04271..5b08551d36 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java @@ -884,7 +884,7 @@ private static void parsePropertyValueFile( && indexUnicodeProperties.ucdVersion.compareTo( VersionInfo.UNICODE_4_0) <= 0 - ? new PropertyUtilities.Joiner("|") + ? IndexUnicodeProperties.MULTIVALUED_JOINER : null; final var originalMultivaluedSplit = propInfo.multivaluedSplit; // The first version of kPrimaryNumeric had spaces in values. @@ -995,7 +995,7 @@ private static void parseNameAliasesFile( indexUnicodeProperties, nextProperties, propInfoSet, - IndexUnicodeProperties.ALPHABETIC_JOINER, + IndexUnicodeProperties.MULTIVALUED_JOINER, false); } } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index ee1ef259b9..10c8d03241 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -448,7 +448,11 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { ? NULL_MATCHER : new SimpleMatcher( propertyValue, - isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR), + getName().equals("Name") || getName().equals("Name_Alias") + ? CHARACTER_NAME_COMPARATOR + : isType(STRING_OR_MISC_MASK) + ? null + : PROPERTY_COMPARATOR), result); } } @@ -720,8 +724,25 @@ public static String toSkeleton(String source) { return skeletonBuffer.toString(); } - /** Returns a representative of the equivalence class of source under UAX44-LM2. */ - public static String toNameSkeleton(String source) { + public static final Comparator CHARACTER_NAME_COMPARATOR = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return compareCharacterNames(o1, o2); + } + }; + + public static int compareCharacterNames(String a, String b) { + if (a == b) return 0; + if (a == null) return -1; + if (b == null) return 1; + return toNameSkeleton(a, false).compareTo(toNameSkeleton(b, false)); + } + + /** Returns a representative of the equivalence class of source under UAX44-LM2. + * If validate=true, checks that source contains only characters allowed in character names. + */ + public static String toNameSkeleton(String source, boolean validate) { if (source == null) return null; StringBuffer result = new StringBuffer(); // remove spaces, medial '-' @@ -741,18 +762,26 @@ public static String toNameSkeleton(String source) { || (i == source.length() - 2 && source.charAt(i - 1) == 'O' && source.charAt(i + 1) == 'E')) { - System.out.println("****** EXCEPTION " + source); + if (validate) { + System.out.println("****** EXCEPTION " + source); + } result.append(ch); } // otherwise don't copy - } else { + } else if (validate) { throw new IllegalArgumentException( "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch); + } else if (ch != '_') { + result.append(Character.toUpperCase(ch)); } } return result.toString(); } + public static String toNameSkeleton(String source) { + return toNameSkeleton(source, true); + } + /** * These routines use the Java functions, because they only need to act on ASCII Changes space, * - into _, inserts _ between lower and UPPER. From 29bc3191b74a30d857a63995644f316dcb554690 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 13 Mar 2025 00:07:45 +0100 Subject: [PATCH 09/14] =?UTF-8?q?Name=5FAlias=20as=20a=20Name=20alias;=20f?= =?UTF-8?q?ailing=20test=20(on=20ne=20fait=20pas=20d=E2=80=99omelette=20sa?= =?UTF-8?q?ns=20casser=20des=20=C5=93ufs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/unicode/jsp/UnicodeSetUtilities.java | 11 ++++++++++ .../org/unicode/jsptest/TestUnicodeSet.java | 21 +++++++++++++++---- .../unicode/props/IndexUnicodeProperties.java | 6 +++++- .../org/unicode/props/UnicodeProperty.java | 5 +++-- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 98266713ec..6e3e40f969 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -14,6 +14,8 @@ import java.util.Map; import java.util.regex.Pattern; import org.unicode.cldr.util.MultiComparator; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.PatternMatcher; @@ -340,6 +342,15 @@ private boolean applyPropertyAlias0( } } set = prop.getSet(propertyValue); + if (set.isEmpty() + && prop instanceof IndexUnicodeProperties.IndexUnicodeProperty + && prop.getName().equals("Name")) { + set = + ((IndexUnicodeProperties.IndexUnicodeProperty) prop) + .getFactory() + .getProperty(UcdProperty.Name_Alias) + .getSet(propertyValue); + } } } else if (isAge) { set = new UnicodeSet(); diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index e527ea970f..30e168c44d 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -182,6 +182,8 @@ public void TestNameMatching() { checkSetsEqual("\\p{Name=no break space}", "[\\xA0]"); checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-E}", "[\\u1180]"); checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]"); + checkSetsEqual("\\p{Name=Hangul jungseong o-e}", "[\\u1180]"); + checkSetsEqual("\\p{Name=Hangul jungseong oe}", "[\\u116C]"); checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]"); checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]"); checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]"); @@ -190,15 +192,26 @@ public void TestNameMatching() { checkSetsEqual("\\p{Name=TIBETAN MARK BKA SHOG YIG MGO}", "[]"); checkSetsEqual("\\p{Name_Alias=newline}", "[\\x0A]"); checkSetsEqual("\\p{Name_Alias=NEW LINE}", "[\\x0A]"); + // The medial hyphen is only significant in HANGUL JUNGSEONG O-E, not in arbitrary O-E/OE. + checkSetsEqual("\\p{Name=twoemdash}", "⸺"); + checkSetsEqual("\\p{Name=SeeNoEvil_Monkey}", "🙈"); + checkSetsEqual("\\p{Name=BALLET S-H-O-E-S}", "🩰"); + checkSetsEqual("[\\p{Name=LATIN SMALL LIGATURE O-E}uf]", "[œuf]"); } @Test public void TestNameAliases() { // Name_Alias values behave as aliases for Name, but not vice-versa. - checkSetsEqual("\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[︘]"); - checkSetsEqual("\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]"); - checkSetsEqual("\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[]"); - checkSetsEqual("\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]"); + checkSetsEqual( + "\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[︘]"); + checkSetsEqual( + "\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]"); + checkSetsEqual( + "\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", + "[]"); + checkSetsEqual( + "\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", + "[︘]"); checkProperties("\\p{Name_Alias=@none@}", "[a-z]"); } diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index 107cffab33..241cb7b921 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -685,7 +685,7 @@ public VersionInfo getUcdVersion() { // .get(toSkeleton(propertyAlias)); // } - class IndexUnicodeProperty extends UnicodeProperty.BaseProperty { + public class IndexUnicodeProperty extends UnicodeProperty.BaseProperty { private final UcdProperty prop; private final Map stringToNamedEnum; @@ -725,6 +725,10 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty { } } + public IndexUnicodeProperties getFactory() { + return IndexUnicodeProperties.this; + } + @Override public boolean isTrivial() { return _getRawUnicodeMap().isEmpty() diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 10c8d03241..f07557b749 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -739,8 +739,9 @@ public static int compareCharacterNames(String a, String b) { return toNameSkeleton(a, false).compareTo(toNameSkeleton(b, false)); } - /** Returns a representative of the equivalence class of source under UAX44-LM2. - * If validate=true, checks that source contains only characters allowed in character names. + /** + * Returns a representative of the equivalence class of source under UAX44-LM2. If + * validate=true, checks that source contains only characters allowed in character names. */ public static String toNameSkeleton(String source, boolean validate) { if (source == null) return null; From 939c80b23dd8feed8e4bbb558e280878e642b1da Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 13 Mar 2025 00:50:00 +0100 Subject: [PATCH 10/14] Put Humpty Dumpty together again. --- .../org/unicode/jsptest/TestUnicodeSet.java | 7 ++++ .../org/unicode/props/UnicodeProperty.java | 41 +++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index 30e168c44d..a87f07ef89 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -184,7 +184,14 @@ public void TestNameMatching() { checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]"); checkSetsEqual("\\p{Name=Hangul jungseong o-e}", "[\\u1180]"); checkSetsEqual("\\p{Name=Hangul jungseong oe}", "[\\u116C]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -E}", "[\\u1180]"); + checkSetsEqual("\\p{Name= HANGUL JUNGSEONG O-E }", "[\\u1180]"); + checkSetsEqual("\\p{Name=_HANGUL_JUNGSEONG_O-E_}", "[\\u1180]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-EO}", "[\\u117F]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE O}", "[\\u117F]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -EO}", "[]"); checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]"); + checkSetsEqual("\\p{Name=MARCHEN_LETTER_-A}", "[\\x{11C88}]"); checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]"); checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]"); checkSetsEqual("\\p{Name=TIBETAN MARK TSA PHRU}", "[]"); diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index f07557b749..43602878a3 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -750,19 +750,44 @@ public static String toNameSkeleton(String source, boolean validate) { // we can do this with char, since no surrogates are involved for (int i = 0; i < source.length(); ++i) { char ch = source.charAt(i); + final char uppercase = Character.toUpperCase(ch); + if (validate && uppercase != ch) { + throw new IllegalArgumentException( + "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch); + } + ch = uppercase; if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') { result.append(ch); } else if (ch == ' ') { // don't copy ever } else if (ch == '-') { - // only copy non-medials AND trailing O-E - if (0 == i - || i == source.length() - 1 - || source.charAt(i - 1) == ' ' - || source.charAt(i + 1) == ' ' - || (i == source.length() - 2 - && source.charAt(i - 1) == 'O' - && source.charAt(i + 1) == 'E')) { + // Only copy a hyphen-minus if it is non-medial, or if it is + // the hyphen in U+1180 HANGUL JUNGSEONG O-E. + boolean medial; + if (0 == i || i == source.length() - 1) { + medial = false; // Name-initial or name-final. + } else { + final char preceding = Character.toUpperCase(source.charAt(i - 1)); + final char following = Character.toUpperCase(source.charAt(i + 1)); + medial = + (('0' <= preceding && preceding <= '9') + || ('A' <= preceding && preceding <= 'Z')) + && (('0' <= following && following <= '9') + || ('A' <= following && following <= 'Z')); + } + boolean is1180 = false; + if (medial + && i <= source.length() - 2 + && Character.toUpperCase(source.charAt(i + 1)) == 'E' + && result.toString().equals("HANGULJUNGSEONGO")) { + is1180 = true; + for (int j = i + 2; j < source.length(); ++j) { + if (source.charAt(j) != ' ' && source.charAt(j) != '_') { + is1180 = false; + } + } + } + if (!medial || is1180) { if (validate) { System.out.println("****** EXCEPTION " + source); } From ed26f6206c4c0aa3859efff35051dc65503d4f4e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 13 Mar 2025 19:23:07 +0100 Subject: [PATCH 11/14] uppercase once Co-authored-by: Markus Scherer --- .../src/main/java/org/unicode/props/UnicodeProperty.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 43602878a3..23974c19e9 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -798,7 +798,7 @@ public static String toNameSkeleton(String source, boolean validate) { throw new IllegalArgumentException( "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch); } else if (ch != '_') { - result.append(Character.toUpperCase(ch)); + result.append(ch); } } return result.toString(); From 2890745b9b3a444fb541a7267e776568a3ac01ed Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 13 Mar 2025 19:38:24 +0100 Subject: [PATCH 12/14] =?UTF-8?q?Don=E2=80=99t=20yell=20about=20non-medial?= =?UTF-8?q?=20hyphens=20nor=20the=20hyphen=20in=20U+1180.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/org/unicode/props/UnicodeProperty.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 23974c19e9..a15233ad9e 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -788,9 +788,6 @@ public static String toNameSkeleton(String source, boolean validate) { } } if (!medial || is1180) { - if (validate) { - System.out.println("****** EXCEPTION " + source); - } result.append(ch); } // otherwise don't copy From a91b6cb1ac91fce23e2c2aa230caf97d1c40bf4c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 13 Mar 2025 19:39:22 +0100 Subject: [PATCH 13/14] s/ff/ild/g --- .../src/main/java/org/unicode/props/UnicodeProperty.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index a15233ad9e..e6fe2984d7 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -745,7 +745,7 @@ public static int compareCharacterNames(String a, String b) { */ public static String toNameSkeleton(String source, boolean validate) { if (source == null) return null; - StringBuffer result = new StringBuffer(); + StringBuilder result = new StringBuilder(); // remove spaces, medial '-' // we can do this with char, since no surrogates are involved for (int i = 0; i < source.length(); ++i) { From bcc8e29def9c7c45de35bcb71e88539552c59c6d Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 13 Mar 2025 19:43:16 +0100 Subject: [PATCH 14/14] isLetterOrDigit --- .../src/main/java/org/unicode/props/UnicodeProperty.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index e6fe2984d7..e54761b123 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -767,13 +767,9 @@ public static String toNameSkeleton(String source, boolean validate) { if (0 == i || i == source.length() - 1) { medial = false; // Name-initial or name-final. } else { - final char preceding = Character.toUpperCase(source.charAt(i - 1)); - final char following = Character.toUpperCase(source.charAt(i + 1)); medial = - (('0' <= preceding && preceding <= '9') - || ('A' <= preceding && preceding <= 'Z')) - && (('0' <= following && following <= '9') - || ('A' <= following && following <= 'Z')); + Character.isLetterOrDigit(source.charAt(i - 1)) + && Character.isLetterOrDigit(source.charAt(i + 1)); } boolean is1180 = false; if (medial