From de44ea74b6cd0416c7951da4f81c972aca9637a7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 19:36:31 +0100 Subject: [PATCH 1/7] =?UTF-8?q?Don=E2=80=99t=20invert=20twice=20on=20compa?= =?UTF-8?q?rison=20queries,=20add=20support=20for=20null=20queries,=20alig?= =?UTF-8?q?n=20identity=20queries=20with=20the=20draft?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/unicode/jsp/UnicodeSetUtilities.java | 10 +++++++- .../org/unicode/jsptest/TestUnicodeSet.java | 23 +++++++++++++++++-- docs/help/list-unicodeset.md | 4 ++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 34eed8b30d..dd2075df27 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -250,13 +250,16 @@ private boolean applyPropertyAlias0( } UnicodeProperty otherProperty = null; boolean testCp = false; + boolean testNone = false; if (trimmedPropertyValue.length() > 1 && trimmedPropertyValue.startsWith("@") && trimmedPropertyValue.endsWith("@")) { String otherPropName = trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1).trim(); - if ("cp".equalsIgnoreCase(otherPropName)) { + if (UnicodeProperty.equalNames("code point", otherPropName)) { testCp = true; + } else if (UnicodeProperty.equalNames("none", otherPropName)) { + testNone = true; } else { otherProperty = factory.getProperty(otherPropName); } @@ -270,8 +273,12 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(i, prop.getValue(i))) { set.add(i); } + invert = false; } + } else if (testNone) { + set = prop.getSet(UnicodeProperty.NULL_MATCHER); } else if (otherProperty != null) { + System.err.println(otherProperty + ", " + invert); set = new UnicodeSet(); for (int i = 0; i <= 0x10FFFF; ++i) { String v1 = prop.getValue(i); @@ -279,6 +286,7 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(v1, v2)) { set.add(i); } + invert = false; } } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index a68acbc108..eabddd810c 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -141,6 +141,25 @@ public void TestPretty() { logln(derived); } + @Test + public void TestInteriorlyNegatedComparison() { + checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]"); + checkSetsEqual( + "\\p{Uppercase≠@Changes_When_Lowercased@}", + "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); + } + + @Test + public void TestIdentityQuery() { + checkSetsEqual("\\p{NFKC_Casefold=@codepoint@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold=@Code_Point@}", "\\P{Changes_When_NFKC_Casefolded}"); + } + + @Test + public void TestNullQuery() { + checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=None}"); + } + // public void TestAExemplars() { // checkProperties("[:exemplars_en:]", "[a]", "[\u0350]"); // } @@ -380,7 +399,7 @@ public void TestGC() { public void TestNF() { for (String nf : new String[] {"d", "c", "kd", "kc"}) { checkSetsEqual("[:isnf" + nf + ":]", "[:nf" + nf + "qc!=N:]"); - checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@cp@:]"); + checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@code point@:]"); } } @@ -479,7 +498,7 @@ public void TestSetSyntax() { checkProperties("\\p{isNFC}", "[:ASCII:]", "[\u212B]"); checkProperties("[:isNFC=no:]", "[\u212B]", "[:ASCII:]"); checkProperties("[:dt!=none:]&[:toNFD=/^\\p{ccc:0}/:]", "[\u00A0]", "[\u0340]"); - checkProperties("[:toLowercase!=@cp@:]", "[A-Z\u00C0]", "[abc]"); + checkProperties("[:toLowercase!=@code point@:]", "[A-Z\u00C0]", "[abc]"); checkProperties("[:toNfkc!=@toNfc@:]", "[\\u00A0]", "[abc]"); String trans1 = Common.NFKC_CF.transform("\u2065"); diff --git a/docs/help/list-unicodeset.md b/docs/help/list-unicodeset.md index 1d9f22ff82..9afa8c3a27 100644 --- a/docs/help/list-unicodeset.md +++ b/docs/help/list-unicodeset.md @@ -113,7 +113,7 @@ There is a special property "cp" that returns the code point itself. For example: * Find the characters whose lowercase is different: - [`\p{toLowercase!=@cp@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40cp%40%7D&g=) + [`\p{toLowercase!=@code point@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40code%20point%40%7D&g=) ## **Available Properties** @@ -157,7 +157,7 @@ then set the Group By box to the property name. 1. uca (the primary UCA weight -- after the CLDR transforms), 2. uca2 (the primary and secondary weights) -Normally, \\p{isX} is equivalent to `\p{toX=@cp@}`. There are some exceptions and +Normally, \\p{isX} is equivalent to `\p{toX=@code point@}`. There are some exceptions and missing cases. Note: The Unassigned, Surrogate, and Private Use code points are skipped in the From 73485b303bcd94377b43fa8e1d91b0c07ccde4f9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 19:41:55 +0100 Subject: [PATCH 2/7] more tests --- .../test/java/org/unicode/jsptest/TestUnicodeSet.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index eabddd810c..15da98ca8a 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -144,6 +144,10 @@ public void TestPretty() { @Test public void TestInteriorlyNegatedComparison() { checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]"); + checkSetsEqual( + "\\p{Uppercase≠@Changes_When_Lowercased@}", + "\\P{Uppercase=@Changes_When_Lowercased@}"); + checkSetsEqual( "\\p{Uppercase≠@Changes_When_Lowercased@}", "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); @@ -151,13 +155,14 @@ public void TestInteriorlyNegatedComparison() { @Test public void TestIdentityQuery() { - checkSetsEqual("\\p{NFKC_Casefold=@codepoint@}", "\\P{Changes_When_NFKC_Casefolded}"); - checkSetsEqual("\\p{NFKC_Casefold=@Code_Point@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold≠@Code_Point@}", "\\p{Changes_When_NFKC_Casefolded}"); } @Test public void TestNullQuery() { checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=None}"); + checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); } // public void TestAExemplars() { From 99d5625723d058d958ebf195da9b62ceec6d1a27 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 19:59:51 +0100 Subject: [PATCH 3/7] LM3 is --- .../src/test/java/org/unicode/jsptest/TestUnicodeSet.java | 4 ++-- .../src/main/java/org/unicode/props/UnicodeProperty.java | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index 15da98ca8a..fe817fa9d5 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -149,7 +149,7 @@ public void TestInteriorlyNegatedComparison() { "\\P{Uppercase=@Changes_When_Lowercased@}"); checkSetsEqual( - "\\p{Uppercase≠@Changes_When_Lowercased@}", + "\\p{Is_Uppercase≠@Changes_When_Lowercased@}", "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); } @@ -161,7 +161,7 @@ public void TestIdentityQuery() { @Test public void TestNullQuery() { - checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=None}"); + checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=Is_None}"); checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 208d98f974..0169761b3c 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -691,8 +691,7 @@ public static int compareNames(String a, String b) { return toSkeleton(a).compareTo(toSkeleton(b)); } - /** Utility for managing property & non-string value aliases */ - // TODO account for special names, tibetan, hangul + /** Returns a representative of the equivalence class of source under UAX44-LM3. */ public static String toSkeleton(String source) { if (source == null) return null; StringBuffer skeletonBuffer = new StringBuffer(); @@ -713,6 +712,10 @@ public static String toSkeleton(String source) { } } } + while (skeletonBuffer.subSequence(0, 2).equals("is")) { + gotOne = true; + skeletonBuffer.delete(0, 2); + } if (!gotOne) return source; // avoid string creation return skeletonBuffer.toString(); } From 2bb2c23c9a41a023f7a8c4f6aa2297778ac755dc Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 20:06:59 +0100 Subject: [PATCH 4/7] comments --- .../src/main/java/org/unicode/jsp/UnicodeSetUtilities.java | 2 ++ .../src/main/java/org/unicode/props/UnicodeProperty.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index dd2075df27..15b726d4d6 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -264,6 +264,8 @@ private boolean applyPropertyAlias0( otherProperty = factory.getProperty(otherPropName); } } + // TODO(egg): Name and Name_Alias require special handling (UAX44-LM2), and + // treating Name_Alias as aliases for Name. boolean isAge = UnicodeProperty.equalNames("age", propertyName); if (prop != null) { UnicodeSet set; diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 0169761b3c..1e1f755497 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -720,7 +720,7 @@ public static String toSkeleton(String source) { return skeletonBuffer.toString(); } - // get the name skeleton + /** Returns a representative of the equivalence class of source under UAX44-LM2. */ public static String toNameSkeleton(String source) { if (source == null) return null; StringBuffer result = new StringBuffer(); From ae2750c6a432288d50437e1937ee568c4946828d Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 20:52:22 +0100 Subject: [PATCH 5/7] out of bounds --- .../src/main/java/org/unicode/props/UnicodeProperty.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 1e1f755497..ee1ef259b9 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -712,7 +712,7 @@ public static String toSkeleton(String source) { } } } - while (skeletonBuffer.subSequence(0, 2).equals("is")) { + while (skeletonBuffer.length() >= 2 && skeletonBuffer.subSequence(0, 2).equals("is")) { gotOne = true; skeletonBuffer.delete(0, 2); } From 244f2e26c3aaf8408d28f337ff1ab16a726720ec Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 20:59:51 +0100 Subject: [PATCH 6/7] Check lb=@none@ (though that should probably be an error). --- .../src/test/java/org/unicode/jsptest/TestUnicodeSet.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index fe817fa9d5..a47d607897 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -161,6 +161,12 @@ public void TestIdentityQuery() { @Test public void TestNullQuery() { + // Check that we are not falling into the trap described in + // https://www.unicode.org/reports/tr44/#UAX44-LM3. + checkProperties("\\p{lb=IS}", "[,.:;]"); + // TODO(egg): This should perhaps be an error. But if it is not an error, it + // should be empty. + checkSetsEqual("\\p{lb=@none@}", "[]"); checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=Is_None}"); checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); } From 30bca05aa4cd44ed3c3e228838f68a95a4c5cefe Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 11 Mar 2025 21:16:05 +0100 Subject: [PATCH 7/7] Millionfold falsification --- .../src/main/java/org/unicode/jsp/UnicodeSetUtilities.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 15b726d4d6..e4f323a3db 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -275,8 +275,8 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(i, prop.getValue(i))) { set.add(i); } - invert = false; } + invert = false; } else if (testNone) { set = prop.getSet(UnicodeProperty.NULL_MATCHER); } else if (otherProperty != null) { @@ -288,8 +288,8 @@ private boolean applyPropertyAlias0( if (invert != UnicodeProperty.equals(v1, v2)) { set.add(i); } - invert = false; } + invert = false; } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { throw new IllegalArgumentException(