Skip to content

Commit 15d0d61

Browse files
authored
Comparison of equivalence relations/partitions in invariants tests (#474)
1 parent 7e99605 commit 15d0d61

File tree

4 files changed

+221
-1
lines changed

4 files changed

+221
-1
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
eclipse.preferences.version=1
22
encoding//src/main/java=UTF-8
3+
encoding//src/test/java=UTF-8
34
encoding/<project>=UTF-8

unicodetools-testutils/.settings/org.eclipse.jdt.core.prefs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@ org.eclipse.jdt.core.compiler.compliance=11
44
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
55
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
66
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
7+
org.eclipse.jdt.core.compiler.processAnnotations=disabled
78
org.eclipse.jdt.core.compiler.release=disabled
89
org.eclipse.jdt.core.compiler.source=11

unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import java.text.ParsePosition;
1717
import java.util.ArrayList;
1818
import java.util.Comparator;
19+
import java.util.HashMap;
1920
import java.util.LinkedHashSet;
2021
import java.util.List;
2122
import java.util.Map;
@@ -239,6 +240,8 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc
239240
showMapLine(line, pp);
240241
} else if (line.startsWith("Show")) {
241242
showLine(line, pp);
243+
} else if (line.startsWith("EquivalencesOf")) {
244+
equivalencesLine(line, pp);
242245
} else {
243246
testLine(line, pp);
244247
}
@@ -272,6 +275,187 @@ static class PropertyComparison {
272275
UnicodeProperty property2;
273276
}
274277

278+
private static void equivalencesLine(String line, ParsePosition pp) throws ParseException {
279+
pp.setIndex("EquivalencesOf".length());
280+
final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable);
281+
final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp);
282+
scan(PATTERN_WHITE_SPACE, line, pp, true);
283+
char relationOperator = line.charAt(pp.getIndex());
284+
pp.setIndex(pp.getIndex() + 1);
285+
final var rightProperty = CompoundProperty.of(LATEST_PROPS, line, pp);
286+
287+
boolean leftShouldImplyRight = false;
288+
boolean rightShouldImplyLeft = false;
289+
290+
boolean negated = true;
291+
switch (relationOperator) {
292+
case '⇍':
293+
relationOperator = '⇐';
294+
break;
295+
case '⇎':
296+
relationOperator = '⇔';
297+
break;
298+
case '⇏':
299+
relationOperator = '⇒';
300+
break;
301+
default:
302+
negated = false;
303+
}
304+
305+
switch (relationOperator) {
306+
case '⇐':
307+
rightShouldImplyLeft = true;
308+
break;
309+
case '⇔':
310+
leftShouldImplyRight = true;
311+
rightShouldImplyLeft = true;
312+
break;
313+
case '⇒':
314+
leftShouldImplyRight = true;
315+
break;
316+
default:
317+
throw new ParseException(line, pp.getIndex());
318+
}
319+
final var leftValues = new HashMap<String, String>();
320+
final var rightValues = new HashMap<String, String>();
321+
final var leftClasses = new HashMap<String, UnicodeSet>();
322+
final var rightClasses = new HashMap<String, UnicodeSet>();
323+
for (String element : domain) {
324+
final var leftValue = new StringBuilder();
325+
final var rightValue = new StringBuilder();
326+
for (int codepoint : element.codePoints().toArray()) {
327+
leftValue.append(leftProperty.getValue(codepoint));
328+
rightValue.append(rightProperty.getValue(codepoint));
329+
}
330+
leftValues.put(element, leftValue.toString());
331+
rightValues.put(element, rightValue.toString());
332+
leftClasses.computeIfAbsent(leftValue.toString(), (k) -> new UnicodeSet()).add(element);
333+
rightClasses
334+
.computeIfAbsent(rightValue.toString(), (k) -> new UnicodeSet())
335+
.add(element);
336+
}
337+
UnicodeSet remainingDomain = domain.cloneAsThawed();
338+
final var leftImpliesRightCounterexamples = new ArrayList<String>();
339+
final var rightImpliesLeftCounterexamples = new ArrayList<String>();
340+
341+
// For the implication ⇒, produce at most one counterexample per equivalence class of the
342+
// left-hand-side equivalence relation: we do not want an example per pair of Unicode code
343+
// points!
344+
if (leftShouldImplyRight) {
345+
while (!remainingDomain.isEmpty()) {
346+
String representative = remainingDomain.iterator().next();
347+
UnicodeSet leftEquivalenceClass = leftClasses.get(leftValues.get(representative));
348+
UnicodeSet rightEquivalenceClass =
349+
rightClasses.get(rightValues.get(representative));
350+
if (leftShouldImplyRight
351+
&& !rightEquivalenceClass.containsAll(leftEquivalenceClass)) {
352+
final String counterexampleRhs =
353+
leftEquivalenceClass
354+
.cloneAsThawed()
355+
.removeAll(rightEquivalenceClass)
356+
.iterator()
357+
.next();
358+
leftImpliesRightCounterexamples.add(
359+
"\t\t"
360+
+ leftProperty.getNameAliases()
361+
+ "("
362+
+ representative
363+
+ ") \t=\t "
364+
+ leftProperty.getNameAliases()
365+
+ "("
366+
+ counterexampleRhs
367+
+ ") \t=\t "
368+
+ leftValues.get(representative)
369+
+ " \tbut\t "
370+
+ rightValues.get(representative)
371+
+ " \t=\t "
372+
+ rightProperty.getNameAliases()
373+
+ "("
374+
+ representative
375+
+ ") \t\t "
376+
+ rightProperty.getNameAliases()
377+
+ "("
378+
+ counterexampleRhs
379+
+ ") \t=\t "
380+
+ rightValues.get(counterexampleRhs));
381+
}
382+
remainingDomain.removeAll(leftEquivalenceClass);
383+
}
384+
}
385+
386+
// Likewise, for the implication ⇐, produce at most one counterexample per equivalence class
387+
// of the
388+
// right-hand-side equivalence relation.
389+
remainingDomain = domain.cloneAsThawed();
390+
if (rightShouldImplyLeft) {
391+
while (!remainingDomain.isEmpty()) {
392+
String representative = remainingDomain.iterator().next();
393+
UnicodeSet leftEquivalenceClass = leftClasses.get(leftValues.get(representative));
394+
UnicodeSet rightEquivalenceClass =
395+
rightClasses.get(rightValues.get(representative));
396+
if (!leftEquivalenceClass.containsAll(rightEquivalenceClass)) {
397+
final String counterexampleRhs =
398+
rightEquivalenceClass
399+
.cloneAsThawed()
400+
.removeAll(leftEquivalenceClass)
401+
.iterator()
402+
.next();
403+
rightImpliesLeftCounterexamples.add(
404+
leftValues.get(representative)
405+
+ " \t=\t "
406+
+ leftProperty.getNameAliases()
407+
+ "("
408+
+ representative
409+
+ ") \t\t "
410+
+ leftProperty.getNameAliases()
411+
+ "("
412+
+ counterexampleRhs
413+
+ ") \t=\t "
414+
+ rightValues.get(counterexampleRhs)
415+
+ " \teven though\t "
416+
+ rightValues.get(representative)
417+
+ " \t=\t "
418+
+ rightProperty.getNameAliases()
419+
+ "("
420+
+ representative
421+
+ ") \t=\t "
422+
+ rightProperty.getNameAliases()
423+
+ "("
424+
+ counterexampleRhs
425+
+ ")\t\t");
426+
}
427+
remainingDomain.removeAll(rightEquivalenceClass);
428+
}
429+
}
430+
final var counterexamples = new ArrayList<>(leftImpliesRightCounterexamples);
431+
counterexamples.addAll(rightImpliesLeftCounterexamples);
432+
boolean failure = counterexamples.isEmpty() == negated;
433+
if (failure) {
434+
++testFailureCount;
435+
printErrorLine("Test Failure", Side.START, testFailureCount);
436+
}
437+
if (counterexamples.isEmpty()) {
438+
println("There are no counterexamples to " + relationOperator + ".");
439+
} else {
440+
if (leftShouldImplyRight) {
441+
println("The implication ⇒ is " + leftImpliesRightCounterexamples.isEmpty() + ".");
442+
}
443+
if (rightShouldImplyLeft) {
444+
println("The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + ".");
445+
}
446+
}
447+
out.println(failure ? "<table class='f'>" : "<table>");
448+
for (String counterexample : counterexamples) {
449+
out.println("<tr><td>");
450+
out.println(toHTML.transform(counterexample).replace("\t", "</td><td>"));
451+
out.println("</tr></td>");
452+
}
453+
out.println("</table>");
454+
if (failure) {
455+
printErrorLine("Test Failure", Side.END, testFailureCount);
456+
}
457+
}
458+
275459
private static void inLine(ParsePosition pp, String line) throws ParseException {
276460
pp.setIndex(2);
277461
final PropertyComparison propertyComparison = getPropertyComparison(pp, line);
@@ -800,6 +984,9 @@ private static void showSet(ParsePosition pp, final String value) {
800984
for (final UnicodeSetIterator it = new UnicodeSetIterator(valueSet);
801985
it.nextRange() && rangeLimit > 0;
802986
--rangeLimit) {
987+
if (it.codepoint == it.IS_STRING) {
988+
continue; // TODO(egg): Show strings too.
989+
}
803990
shorter.add(it.codepoint, it.codepointEnd);
804991
}
805992
abbreviated = totalSize - shorter.size();

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,28 @@
7474
# It then tests that against the result of filtering out NSM characters from X, then getting the BIDI_Class.
7575
#
7676
##########################
77+
# EquivalencesOf <unicodeSet> <props> (⇐|⇔|⇒|⇍|⇎|⇏) <props>
78+
#
79+
# Verify that the equivalence of elements of <unicodeSet> up to the left <props> is implied by,
80+
# is equivalent to, or implies equivalence up to the right <props>, or verify the negation of these
81+
# relations.
82+
#
83+
# On both sides, <props> is a composition of properties and filters, as in an “In” line.
84+
#
85+
# Example:
86+
# The case-insensitive comparison of ASCII identifiers defined by comparing their uppercase
87+
# mappings is equivalent to that defined by comparing their lowercase mappings:
88+
# EquivalencesOf \p{Block=Basic Latin} Uppercase_Mapping ⇔ Lowercase_Mapping
89+
# This is not true in the broader Latin script (in fact neither implication holds).
90+
# EquivalencesOf \p{Script=Latin} Uppercase_Mapping ⇎ Lowercase_Mapping
91+
# The simple and full case foldings do not define the same equivalence classes on these
92+
# strings:
93+
# Let $strings = [ {Straße} {STRASSE} {ᾠδή} {ὨΙΔΉ} {...} ]
94+
# EquivalencesOf $strings Case_Folding ⇎ Simple_Case_Folding
95+
# Specifically, full case folding is coarser than simple case folding.
96+
# EquivalencesOf $strings Case_Folding ⇏ Simple_Case_Folding
97+
# EquivalencesOf $strings Case_Folding ⇐ Simple_Case_Folding
98+
##########################
7799
# There is new syntax for testing UnicodeMaps
78100
#
79101
# Map <unicodeMap> <relation> <unicodeMap>
@@ -115,6 +137,8 @@
115137
Let $foo = \p{ccc=9}
116138
Let $fii = \p{toNFD=/$foo/}
117139

140+
Let $codepoints = [\u0000-\U0010FFFF]
141+
118142
Let $gcAllPunctuation = \p{gc=/_Punctuation/}
119143
$gcAllPunctuation = [\p{gc=Close_Punctuation}\p{gc=Connector_Punctuation}}\p{gc=Dash_Punctuation}\p{gc=Final_Punctuation}\p{gc=Initial_Punctuation}\p{gc=Open_Punctuation}\p{gc=Other_Punctuation}]
120144

@@ -268,6 +292,11 @@ Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F]
268292
In \p{sc=Cher} cf = uc
269293
In \p{sc=Cher} scf = suc
270294

295+
# Simple and full case foldings define the same equivalence classes on code points.
296+
# This used not to be true, but was rectified by
297+
# https://www.unicode.org/cgi-bin/GetL2Ref.pl?175-C19.
298+
EquivalencesOf $codepoints Case_Folding ⇔ Simple_Case_Folding
299+
271300
# Stability: All characters with the Lowercase property and all characters with the Uppercase property have the Alphabetic property.
272301
\p{Alphabetic} ⊃ [\p{Uppercase} \p{Lowercase}]
273302

@@ -795,7 +824,9 @@ $punct ⊇ [[\u0021-\u007E] - [0-9 A-Z a-z]]
795824
# We should be able to use:
796825
# Map {\m{Case_Folding}&\P{U-1:gc=Cn}} = {\m{*Case_Folding}]}
797826

798-
Map {\m{Case_Folding}&[^\p{age=14.0}]} = {\m{*Case_Folding}&[^\p{age=14.0}]}
827+
# Commented out because excruciatingly slow, and tested systematically above with an In line.
828+
# TODO(macchiati): Make this faster.
829+
# Map {\m{Case_Folding}&[^\p{age=14.0}]} = {\m{*Case_Folding}&[^\p{age=14.0}]}
799830

800831
# The following are 'red flag' tests, just so that we review the changes and make sure they are ok.
801832

0 commit comments

Comments
 (0)