Comparison of equivalence relations/partitions in invariants tests (#474)

eggrobin · web-flow · commit 15d0d611f74c · 2023-05-17T14:12:39.000+02:00
diff --git a/unicodetools-testutils/.settings/org.eclipse.core.resources.prefs b/unicodetools-testutils/.settings/org.eclipse.core.resources.prefs
@@ -1,3 +1,4 @@
 eclipse.preferences.version=1
 encoding//src/main/java=UTF-8
+encoding//src/test/java=UTF-8
 encoding/<project>=UTF-8
diff --git a/unicodetools-testutils/.settings/org.eclipse.jdt.core.prefs b/unicodetools-testutils/.settings/org.eclipse.jdt.core.prefs
@@ -4,5 +4,6 @@ org.eclipse.jdt.core.compiler.compliance=11
 org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
 org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
 org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
+org.eclipse.jdt.core.compiler.processAnnotations=disabled
 org.eclipse.jdt.core.compiler.release=disabled
 org.eclipse.jdt.core.compiler.source=11
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -16,6 +16,7 @@
 import java.text.ParsePosition;
 import java.util.ArrayList;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
@@ -239,6 +240,8 @@ public static int testInvariants(String inputFile, boolean doRange) throws IOExc
                                 showMapLine(line, pp);
                             } else if (line.startsWith("Show")) {
                                 showLine(line, pp);
+                            } else if (line.startsWith("EquivalencesOf")) {
+                                equivalencesLine(line, pp);
                             } else {
                                 testLine(line, pp);
                             }
@@ -272,6 +275,187 @@ static class PropertyComparison {
         UnicodeProperty property2;
     }
 
+    private static void equivalencesLine(String line, ParsePosition pp) throws ParseException {
+        pp.setIndex("EquivalencesOf".length());
+        final UnicodeSet domain = new UnicodeSet(line, pp, symbolTable);
+        final var leftProperty = CompoundProperty.of(LATEST_PROPS, line, pp);
+        scan(PATTERN_WHITE_SPACE, line, pp, true);
+        char relationOperator = line.charAt(pp.getIndex());
+        pp.setIndex(pp.getIndex() + 1);
+        final var rightProperty = CompoundProperty.of(LATEST_PROPS, line, pp);
+
+        boolean leftShouldImplyRight = false;
+        boolean rightShouldImplyLeft = false;
+
+        boolean negated = true;
+        switch (relationOperator) {
+            case '⇍':
+                relationOperator = '⇐';
+                break;
+            case '⇎':
+                relationOperator = '⇔';
+                break;
+            case '⇏':
+                relationOperator = '⇒';
+                break;
+            default:
+                negated = false;
+        }
+
+        switch (relationOperator) {
+            case '⇐':
+                rightShouldImplyLeft = true;
+                break;
+            case '⇔':
+                leftShouldImplyRight = true;
+                rightShouldImplyLeft = true;
+                break;
+            case '⇒':
+                leftShouldImplyRight = true;
+                break;
+            default:
+                throw new ParseException(line, pp.getIndex());
+        }
+        final var leftValues = new HashMap<String, String>();
+        final var rightValues = new HashMap<String, String>();
+        final var leftClasses = new HashMap<String, UnicodeSet>();
+        final var rightClasses = new HashMap<String, UnicodeSet>();
+        for (String element : domain) {
+            final var leftValue = new StringBuilder();
+            final var rightValue = new StringBuilder();
+            for (int codepoint : element.codePoints().toArray()) {
+                leftValue.append(leftProperty.getValue(codepoint));
+                rightValue.append(rightProperty.getValue(codepoint));
+            }
+            leftValues.put(element, leftValue.toString());
+            rightValues.put(element, rightValue.toString());
+            leftClasses.computeIfAbsent(leftValue.toString(), (k) -> new UnicodeSet()).add(element);
+            rightClasses
+                    .computeIfAbsent(rightValue.toString(), (k) -> new UnicodeSet())
+                    .add(element);
+        }
+        UnicodeSet remainingDomain = domain.cloneAsThawed();
+        final var leftImpliesRightCounterexamples = new ArrayList<String>();
+        final var rightImpliesLeftCounterexamples = new ArrayList<String>();
+
+        // For the implication ⇒, produce at most one counterexample per equivalence class of the
+        // left-hand-side equivalence relation: we do not want an example per pair of Unicode code
+        // points!
+        if (leftShouldImplyRight) {
+            while (!remainingDomain.isEmpty()) {
+                String representative = remainingDomain.iterator().next();
+                UnicodeSet leftEquivalenceClass = leftClasses.get(leftValues.get(representative));
+                UnicodeSet rightEquivalenceClass =
+                        rightClasses.get(rightValues.get(representative));
+                if (leftShouldImplyRight
+                        && !rightEquivalenceClass.containsAll(leftEquivalenceClass)) {
+                    final String counterexampleRhs =
+                            leftEquivalenceClass
+                                    .cloneAsThawed()
+                                    .removeAll(rightEquivalenceClass)
+                                    .iterator()
+                                    .next();
+                    leftImpliesRightCounterexamples.add(
+                            "\t\t"
+                                    + leftProperty.getNameAliases()
+                                    + "("
+                                    + representative
+                                    + ") \t=\t "
+                                    + leftProperty.getNameAliases()
+                                    + "("
+                                    + counterexampleRhs
+                                    + ") \t=\t "
+                                    + leftValues.get(representative)
+                                    + " \tbut\t "
+                                    + rightValues.get(representative)
+                                    + " \t=\t "
+                                    + rightProperty.getNameAliases()
+                                    + "("
+                                    + representative
+                                    + ") \t≠\t "
+                                    + rightProperty.getNameAliases()
+                                    + "("
+                                    + counterexampleRhs
+                                    + ") \t=\t "
+                                    + rightValues.get(counterexampleRhs));
+                }
+                remainingDomain.removeAll(leftEquivalenceClass);
+            }
+        }
+
+        // Likewise, for the implication ⇐, produce at most one counterexample per equivalence class
+        // of the
+        // right-hand-side equivalence relation.
+        remainingDomain = domain.cloneAsThawed();
+        if (rightShouldImplyLeft) {
+            while (!remainingDomain.isEmpty()) {
+                String representative = remainingDomain.iterator().next();
+                UnicodeSet leftEquivalenceClass = leftClasses.get(leftValues.get(representative));
+                UnicodeSet rightEquivalenceClass =
+                        rightClasses.get(rightValues.get(representative));
+                if (!leftEquivalenceClass.containsAll(rightEquivalenceClass)) {
+                    final String counterexampleRhs =
+                            rightEquivalenceClass
+                                    .cloneAsThawed()
+                                    .removeAll(leftEquivalenceClass)
+                                    .iterator()
+                                    .next();
+                    rightImpliesLeftCounterexamples.add(
+                            leftValues.get(representative)
+                                    + " \t=\t "
+                                    + leftProperty.getNameAliases()
+                                    + "("
+                                    + representative
+                                    + ") \t≠\t "
+                                    + leftProperty.getNameAliases()
+                                    + "("
+                                    + counterexampleRhs
+                                    + ") \t=\t "
+                                    + rightValues.get(counterexampleRhs)
+                                    + " \teven though\t "
+                                    + rightValues.get(representative)
+                                    + " \t=\t "
+                                    + rightProperty.getNameAliases()
+                                    + "("
+                                    + representative
+                                    + ") \t=\t "
+                                    + rightProperty.getNameAliases()
+                                    + "("
+                                    + counterexampleRhs
+                                    + ")\t\t");
+                }
+                remainingDomain.removeAll(rightEquivalenceClass);
+            }
+        }
+        final var counterexamples = new ArrayList<>(leftImpliesRightCounterexamples);
+        counterexamples.addAll(rightImpliesLeftCounterexamples);
+        boolean failure = counterexamples.isEmpty() == negated;
+        if (failure) {
+            ++testFailureCount;
+            printErrorLine("Test Failure", Side.START, testFailureCount);
+        }
+        if (counterexamples.isEmpty()) {
+            println("There are no counterexamples to " + relationOperator + ".");
+        } else {
+            if (leftShouldImplyRight) {
+                println("The implication ⇒ is " + leftImpliesRightCounterexamples.isEmpty() + ".");
+            }
+            if (rightShouldImplyLeft) {
+                println("The implication ⇐ is " + rightImpliesLeftCounterexamples.isEmpty() + ".");
+            }
+        }
+        out.println(failure ? "<table class='f'>" : "<table>");
+        for (String counterexample : counterexamples) {
+            out.println("<tr><td>");
+            out.println(toHTML.transform(counterexample).replace("\t", "</td><td>"));
+            out.println("</tr></td>");
+        }
+        out.println("</table>");
+        if (failure) {
+            printErrorLine("Test Failure", Side.END, testFailureCount);
+        }
+    }
+
     private static void inLine(ParsePosition pp, String line) throws ParseException {
         pp.setIndex(2);
         final PropertyComparison propertyComparison = getPropertyComparison(pp, line);
@@ -800,6 +984,9 @@ private static void showSet(ParsePosition pp, final String value) {
             for (final UnicodeSetIterator it = new UnicodeSetIterator(valueSet);
                     it.nextRange() && rangeLimit > 0;
                     --rangeLimit) {
+                if (it.codepoint == it.IS_STRING) {
+                    continue; // TODO(egg): Show strings too.
+                }
                 shorter.add(it.codepoint, it.codepointEnd);
             }
             abbreviated = totalSize - shorter.size();
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -74,6 +74,28 @@
 #       It then tests that against the result of filtering out NSM characters from X, then getting the BIDI_Class.
 #
 ##########################
+# EquivalencesOf <unicodeSet> <props> (⇐|⇔|⇒|⇍|⇎|⇏) <props>
+# 
+# Verify that the equivalence of elements of <unicodeSet> up to the left <props> is implied by,
+# is equivalent to, or implies equivalence up to the right <props>, or verify the negation of these
+# relations.
+#
+# On both sides, <props> is a composition of properties and filters, as in an “In” line.
+#
+#   Example:
+#       The case-insensitive comparison of ASCII identifiers defined by comparing their uppercase
+#       mappings is equivalent to that defined by comparing their lowercase mappings:
+#          EquivalencesOf \p{Block=Basic Latin} Uppercase_Mapping ⇔ Lowercase_Mapping
+#       This is not true in the broader Latin script (in fact neither implication holds).
+#          EquivalencesOf \p{Script=Latin} Uppercase_Mapping ⇎ Lowercase_Mapping
+#       The simple and full case foldings do not define the same equivalence classes on these
+#       strings:
+#          Let $strings = [ {Straße} {STRASSE} {ᾠδή} {ὨΙΔΉ} {...} ]
+#          EquivalencesOf $strings Case_Folding ⇎ Simple_Case_Folding
+#       Specifically, full case folding is coarser than simple case folding.
+#          EquivalencesOf $strings Case_Folding ⇏ Simple_Case_Folding
+#          EquivalencesOf $strings Case_Folding ⇐ Simple_Case_Folding
+##########################
 # There is new syntax for testing UnicodeMaps
 #
 #  Map <unicodeMap> <relation> <unicodeMap>
@@ -115,6 +137,8 @@
 Let $foo = \p{ccc=9}
 Let $fii = \p{toNFD=/$foo/}
 
+Let $codepoints = [\u0000-\U0010FFFF]
+
 Let $gcAllPunctuation = \p{gc=/_Punctuation/}
 $gcAllPunctuation = [\p{gc=Close_Punctuation}\p{gc=Connector_Punctuation}}\p{gc=Dash_Punctuation}\p{gc=Final_Punctuation}\p{gc=Initial_Punctuation}\p{gc=Open_Punctuation}\p{gc=Other_Punctuation}]
 
@@ -268,6 +292,11 @@ Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F]
 In \p{sc=Cher} cf = uc
 In \p{sc=Cher} scf = suc
 
+# Simple and full case foldings define the same equivalence classes on code points.
+# This used not to be true, but was rectified by
+# https://www.unicode.org/cgi-bin/GetL2Ref.pl?175-C19.
+EquivalencesOf $codepoints Case_Folding ⇔ Simple_Case_Folding
+
 # Stability: All characters with the Lowercase property and all characters with the Uppercase property have the Alphabetic property. 
 \p{Alphabetic} ⊃ [\p{Uppercase} \p{Lowercase}]
 
@@ -795,7 +824,9 @@ $punct  ⊇ [[\u0021-\u007E] - [0-9 A-Z a-z]]
 # We should be able to use:
 # Map {\m{Case_Folding}&\P{U-1:gc=Cn}} = {\m{*Case_Folding}]}
 
-Map {\m{Case_Folding}&[^\p{age=14.0}]} = {\m{*Case_Folding}&[^\p{age=14.0}]}
+# Commented out because excruciatingly slow, and tested systematically above with an In line.
+# TODO(macchiati): Make this faster.
+# Map {\m{Case_Folding}&[^\p{age=14.0}]} = {\m{*Case_Folding}&[^\p{age=14.0}]}
 
 # The following are 'red flag' tests, just so that we review the changes and make sure they are ok.