Skip to content

Commit c60a326

Browse files
authored
Don’t invert twice on comparison queries (#1057)
1 parent d29babc commit c60a326

File tree

4 files changed

+51
-8
lines changed

4 files changed

+51
-8
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -250,17 +250,22 @@ private boolean applyPropertyAlias0(
250250
}
251251
UnicodeProperty otherProperty = null;
252252
boolean testCp = false;
253+
boolean testNone = false;
253254
if (trimmedPropertyValue.length() > 1
254255
&& trimmedPropertyValue.startsWith("@")
255256
&& trimmedPropertyValue.endsWith("@")) {
256257
String otherPropName =
257258
trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1).trim();
258-
if ("cp".equalsIgnoreCase(otherPropName)) {
259+
if (UnicodeProperty.equalNames("code point", otherPropName)) {
259260
testCp = true;
261+
} else if (UnicodeProperty.equalNames("none", otherPropName)) {
262+
testNone = true;
260263
} else {
261264
otherProperty = factory.getProperty(otherPropName);
262265
}
263266
}
267+
// TODO(egg): Name and Name_Alias require special handling (UAX44-LM2), and
268+
// treating Name_Alias as aliases for Name.
264269
boolean isAge = UnicodeProperty.equalNames("age", propertyName);
265270
if (prop != null) {
266271
UnicodeSet set;
@@ -271,7 +276,11 @@ private boolean applyPropertyAlias0(
271276
set.add(i);
272277
}
273278
}
279+
invert = false;
280+
} else if (testNone) {
281+
set = prop.getSet(UnicodeProperty.NULL_MATCHER);
274282
} else if (otherProperty != null) {
283+
System.err.println(otherProperty + ", " + invert);
275284
set = new UnicodeSet();
276285
for (int i = 0; i <= 0x10FFFF; ++i) {
277286
String v1 = prop.getValue(i);
@@ -280,6 +289,7 @@ private boolean applyPropertyAlias0(
280289
set.add(i);
281290
}
282291
}
292+
invert = false;
283293
} else if (patternMatcher == null) {
284294
if (!isValid(prop, propertyValue)) {
285295
throw new IllegalArgumentException(

UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java

Lines changed: 32 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,36 @@ public void TestPretty() {
141141
logln(derived);
142142
}
143143

144+
@Test
145+
public void TestInteriorlyNegatedComparison() {
146+
checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]");
147+
checkSetsEqual(
148+
"\\p{Uppercase≠@Changes_When_Lowercased@}",
149+
"\\P{Uppercase=@Changes_When_Lowercased@}");
150+
151+
checkSetsEqual(
152+
"\\p{Is_Uppercase≠@Changes_When_Lowercased@}",
153+
"[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]");
154+
}
155+
156+
@Test
157+
public void TestIdentityQuery() {
158+
checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}");
159+
checkSetsEqual("\\p{NFKC_Casefold≠@Code_Point@}", "\\p{Changes_When_NFKC_Casefolded}");
160+
}
161+
162+
@Test
163+
public void TestNullQuery() {
164+
// Check that we are not falling into the trap described in
165+
// https://www.unicode.org/reports/tr44/#UAX44-LM3.
166+
checkProperties("\\p{lb=IS}", "[,.:;]");
167+
// TODO(egg): This should perhaps be an error. But if it is not an error, it
168+
// should be empty.
169+
checkSetsEqual("\\p{lb=@none@}", "[]");
170+
checkSetsEqual("\\p{Bidi_Paired_Bracket=@none@}", "\\p{Bidi_Paired_Bracket_Type=Is_None}");
171+
checkSetsEqual("\\p{Bidi_Paired_Bracket≠@None@}", "\\p{Bidi_Paired_Bracket_Type≠None}");
172+
}
173+
144174
// public void TestAExemplars() {
145175
// checkProperties("[:exemplars_en:]", "[a]", "[\u0350]");
146176
// }
@@ -380,7 +410,7 @@ public void TestGC() {
380410
public void TestNF() {
381411
for (String nf : new String[] {"d", "c", "kd", "kc"}) {
382412
checkSetsEqual("[:isnf" + nf + ":]", "[:nf" + nf + "qc!=N:]");
383-
checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@cp@:]");
413+
checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@code point@:]");
384414
}
385415
}
386416

@@ -479,7 +509,7 @@ public void TestSetSyntax() {
479509
checkProperties("\\p{isNFC}", "[:ASCII:]", "[\u212B]");
480510
checkProperties("[:isNFC=no:]", "[\u212B]", "[:ASCII:]");
481511
checkProperties("[:dt!=none:]&[:toNFD=/^\\p{ccc:0}/:]", "[\u00A0]", "[\u0340]");
482-
checkProperties("[:toLowercase!=@cp@:]", "[A-Z\u00C0]", "[abc]");
512+
checkProperties("[:toLowercase!=@code point@:]", "[A-Z\u00C0]", "[abc]");
483513
checkProperties("[:toNfkc!=@toNfc@:]", "[\\u00A0]", "[abc]");
484514

485515
String trans1 = Common.NFKC_CF.transform("\u2065");

docs/help/list-unicodeset.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -113,7 +113,7 @@ There is a special property "cp" that returns the code point itself. For
113113
example:
114114

115115
* Find the characters whose lowercase is different:
116-
[`\p{toLowercase!=@cp@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40cp%40%7D&g=)
116+
[`\p{toLowercase!=@code point@}`](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BtoLowercase!%3D%40code%20point%40%7D&g=)
117117

118118
## **Available Properties**
119119

@@ -157,7 +157,7 @@ then set the Group By box to the property name.
157157
1. uca (the primary UCA weight -- after the CLDR transforms),
158158
2. uca2 (the primary and secondary weights)
159159

160-
Normally, \\p{isX} is equivalent to `\p{toX=@cp@}`. There are some exceptions and
160+
Normally, \\p{isX} is equivalent to `\p{toX=@code point@}`. There are some exceptions and
161161
missing cases.
162162

163163
Note: The Unassigned, Surrogate, and Private Use code points are skipped in the

unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -691,8 +691,7 @@ public static int compareNames(String a, String b) {
691691
return toSkeleton(a).compareTo(toSkeleton(b));
692692
}
693693

694-
/** Utility for managing property & non-string value aliases */
695-
// TODO account for special names, tibetan, hangul
694+
/** Returns a representative of the equivalence class of source under UAX44-LM3. */
696695
public static String toSkeleton(String source) {
697696
if (source == null) return null;
698697
StringBuffer skeletonBuffer = new StringBuffer();
@@ -713,11 +712,15 @@ public static String toSkeleton(String source) {
713712
}
714713
}
715714
}
715+
while (skeletonBuffer.length() >= 2 && skeletonBuffer.subSequence(0, 2).equals("is")) {
716+
gotOne = true;
717+
skeletonBuffer.delete(0, 2);
718+
}
716719
if (!gotOne) return source; // avoid string creation
717720
return skeletonBuffer.toString();
718721
}
719722

720-
// get the name skeleton
723+
/** Returns a representative of the equivalence class of source under UAX44-LM2. */
721724
public static String toNameSkeleton(String source) {
722725
if (source == null) return null;
723726
StringBuffer result = new StringBuffer();

0 commit comments

Comments
 (0)