diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 34eed8b30d..e4f323a3db 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -250,17 +250,22 @@ private boolean applyPropertyAlias0( } UnicodeProperty otherProperty = null; boolean testCp = false; + boolean testNone = false; if (trimmedPropertyValue.length() > 1 && trimmedPropertyValue.startsWith("@") && trimmedPropertyValue.endsWith("@")) { String otherPropName = trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1).trim(); - if ("cp".equalsIgnoreCase(otherPropName)) { + if (UnicodeProperty.equalNames("code point", otherPropName)) { testCp = true; + } else if (UnicodeProperty.equalNames("none", otherPropName)) { + testNone = true; } else { otherProperty = factory.getProperty(otherPropName); } } + // TODO(egg): Name and Name_Alias require special handling (UAX44-LM2), and + // treating Name_Alias as aliases for Name. 
boolean isAge = UnicodeProperty.equalNames("age", propertyName); if (prop != null) { UnicodeSet set; @@ -271,7 +276,10 @@ private boolean applyPropertyAlias0( set.add(i); } } + invert = false; + } else if (testNone) { + set = prop.getSet(UnicodeProperty.NULL_MATCHER); } else if (otherProperty != null) { set = new UnicodeSet(); for (int i = 0; i <= 0x10FFFF; ++i) { String v1 = prop.getValue(i); @@ -280,6 +288,7 @@ private boolean applyPropertyAlias0( set.add(i); } } + invert = false; } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { throw new IllegalArgumentException( diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index a68acbc108..a47d607897 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -141,6 +141,36 @@ public void TestPretty() { logln(derived); } + @Test + public void TestInteriorlyNegatedComparison() { + checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]"); + checkSetsEqual( + "\\p{Uppercase≠@Changes_When_Lowercased@}", + "\\P{Uppercase=@Changes_When_Lowercased@}"); + + checkSetsEqual( + "\\p{Is_Uppercase≠@Changes_When_Lowercased@}", + "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); + } + + @Test + public void TestIdentityQuery() { + checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}"); + checkSetsEqual("\\p{NFKC_Casefold≠@Code_Point@}", "\\p{Changes_When_NFKC_Casefolded}"); + } + + @Test + public void TestNullQuery() { + // Check that we are not falling into the trap described in + // https://www.unicode.org/reports/tr44/#UAX44-LM3. + checkProperties("\\p{lb=IS}", "[,.:;]"); + // TODO(egg): This should perhaps be an error. But if it is not an error, it + // should be empty. 
+ checkSetsEqual("\\p{lb=@none@}", "[]"); + checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=Is_None}"); + checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}"); + } + // public void TestAExemplars() { // checkProperties("[:exemplars_en:]", "[a]", "[\u0350]"); // } @@ -380,7 +410,7 @@ public void TestGC() { public void TestNF() { for (String nf : new String[] {"d", "c", "kd", "kc"}) { checkSetsEqual("[:isnf" + nf + ":]", "[:nf" + nf + "qc!=N:]"); - checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@cp@:]"); + checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@code point@:]"); } } @@ -479,7 +509,7 @@ public void TestSetSyntax() { checkProperties("\\p{isNFC}", "[:ASCII:]", "[\u212B]"); checkProperties("[:isNFC=no:]", "[\u212B]", "[:ASCII:]"); checkProperties("[:dt!=none:]&[:toNFD=/^\\p{ccc:0}/:]", "[\u00A0]", "[\u0340]"); - checkProperties("[:toLowercase!=@cp@:]", "[A-Z\u00C0]", "[abc]"); + checkProperties("[:toLowercase!=@code point@:]", "[A-Z\u00C0]", "[abc]"); checkProperties("[:toNfkc!=@toNfc@:]", "[\\u00A0]", "[abc]"); String trans1 = Common.NFKC_CF.transform("\u2065"); diff --git a/docs/help/list-unicodeset.md b/docs/help/list-unicodeset.md index 1d9f22ff82..9afa8c3a27 100644 --- a/docs/help/list-unicodeset.md +++ b/docs/help/list-unicodeset.md @@ -113,7 +113,7 @@ There is a special property "cp" that returns the code point itself. For example: * Find the characters whose lowercase is different: - [`\p{toLowercase!=@cp@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40cp%40%7D&g=) + [`\p{toLowercase!=@code point@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40code%20point%40%7D&g=) ## **Available Properties** @@ -157,7 +157,7 @@ then set the Group By box to the property name. 1. uca (the primary UCA weight -- after the CLDR transforms), 2. 
uca2 (the primary and secondary weights) -Normally, \\p{isX} is equivalent to `\p{toX=@cp@}`. There are some exceptions and +Normally, \\p{isX} is equivalent to `\p{toX=@code point@}`. There are some exceptions and missing cases. Note: The Unassigned, Surrogate, and Private Use code points are skipped in the diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 208d98f974..ee1ef259b9 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -691,8 +691,7 @@ public static int compareNames(String a, String b) { return toSkeleton(a).compareTo(toSkeleton(b)); } - /** Utility for managing property & non-string value aliases */ - // TODO account for special names, tibetan, hangul + /** Returns a representative of the equivalence class of source under UAX44-LM3. */ public static String toSkeleton(String source) { if (source == null) return null; StringBuffer skeletonBuffer = new StringBuffer(); @@ -713,11 +712,15 @@ public static String toSkeleton(String source) { } } } + while (skeletonBuffer.length() >= 2 && skeletonBuffer.subSequence(0, 2).equals("is")) { + gotOne = true; + skeletonBuffer.delete(0, 2); + } if (!gotOne) return source; // avoid string creation return skeletonBuffer.toString(); } - // get the name skeleton + /** Returns a representative of the equivalence class of source under UAX44-LM2. */ public static String toNameSkeleton(String source) { if (source == null) return null; StringBuffer result = new StringBuffer();