Split addition comparisons (#942)

eggrobin · web-flow · commit 5b4d24fa4454 · 2024-10-02T10:21:45.000+02:00
* Improvements to the invariant test language

* Split the addition comparison test files
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -563,13 +563,11 @@ private static void propertywiseCorrespondenceLine(
         final List<String> errorMessageLines = new ArrayList<>();
         final List<UnicodeSet> sets = new ArrayList<>();
         sets.add(firstSet);
-        expectToken(":", pp, source);
 
-        // Index of the first set of multi-character strings (and of the first multi-character
-        // reference string).
+        // Parse the sets prefixed by `:`, which must not contain multi-character strings.
+        // Their number (counting the first set) is the index of the first value-only set.
         // This is `m` in the documentation in UnicodeInvariantTest.txt.
-        int firstMultiCharacterIndex = -1;
-        do {
+        while (Lookahead.oneToken(pp, source).accept(":")) {
             final var set = parseUnicodeSet(source, pp);
             if (set.size() != firstSet.size()) {
                 throw new BackwardParseException(
@@ -580,24 +578,29 @@ private static void propertywiseCorrespondenceLine(
                                 + ")",
                         pp.getIndex());
             }
-            if (set.hasStrings() && set.strings().size() != set.size()) {
+            if (set.hasStrings()) {
                 throw new BackwardParseException(
-                        "Sets should be all strings or all code points for property correspondence",
+                        "Strings are only allowed in value-only sets (prefixed by ⧴ rather than :)",
                         pp.getIndex());
             }
-            if (firstMultiCharacterIndex == -1) {
-                if (set.hasStrings()) {
-                    firstMultiCharacterIndex = sets.size();
-                }
-            } else if (!set.hasStrings()) {
+            sets.add(set);
+        }
+        // Index of the first set of value-only sets (prefixed by ⧴ rather than :).
+        // Only value-only sets may contain multi-character strings.
+        // This is `m` in the documentation in UnicodeInvariantTest.txt.
+        final int firstValueOnlyIndex = sets.size();
+        while (Lookahead.oneToken(pp, source).accept("⧴")) {
+            final var set = parseUnicodeSet(source, pp);
+            if (set.size() != firstSet.size()) {
                 throw new BackwardParseException(
-                        "Code points should come before strings in property correspondence",
+                        "Sets should have the same size for property correspondence (got "
+                                + set.size()
+                                + ", expected "
+                                + firstSet.size()
+                                + ")",
                         pp.getIndex());
             }
             sets.add(set);
-        } while (Lookahead.oneToken(pp, source).accept(":"));
-        if (firstMultiCharacterIndex == -1) {
-            firstMultiCharacterIndex = sets.size();
         }
         final List<String> referenceCodePoints = new ArrayList<>();
         expectToken("CorrespondTo", pp, source);
@@ -608,14 +611,14 @@ private static void propertywiseCorrespondenceLine(
                         "reference should be a single code point or string for property correspondence",
                         pp.getIndex());
             }
-            if (referenceSet.hasStrings()
-                    != (referenceCodePoints.size() >= firstMultiCharacterIndex)) {
+            if (referenceSet.hasStrings() && referenceCodePoints.size() < firstValueOnlyIndex) {
                 throw new BackwardParseException(
-                        "Strings should correspond to strings for property correspondence",
+                        "Strings are only allowed in value-only sets (prefixed by ⧴ rather than :)",
                         pp.getIndex());
             }
             referenceCodePoints.add(referenceSet.iterator().next());
-        } while (Lookahead.oneToken(pp, source).accept(":"));
+        } while (Lookahead.oneToken(pp, source)
+                .accept(referenceCodePoints.size() >= firstValueOnlyIndex ? "⧴" : ":"));
         if (referenceCodePoints.size() != sets.size()) {
             throw new BackwardParseException(
                     "Property correspondence requires as many reference code points as sets under test",
@@ -638,8 +641,14 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                 String property = Lookahead.oneToken(pp, source).consume();
                 expectToken("(", pp, source);
                 String actualValueAlias = Lookahead.oneToken(pp, source).consume();
+                while (Lookahead.oneToken(pp, source).accept("|")) {
+                    actualValueAlias += "|" + Lookahead.oneToken(pp, source).consume();
+                }
                 expectToken("vs", pp, source);
                 String referenceValueAlias = Lookahead.oneToken(pp, source).consume();
+                while (Lookahead.oneToken(pp, source).accept("|")) {
+                    referenceValueAlias += "|" + Lookahead.oneToken(pp, source).consume();
+                }
                 expectToken(")", pp, source);
                 expectedPropertyDifferences.put(
                         property,
@@ -657,7 +666,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                 expectedDifference = expectedPropertyDifferences.get(alias);
             }
             if (expectedDifference != null) {
-                for (int k = 0; k < firstMultiCharacterIndex; ++k) {
+                for (int k = 0; k < firstValueOnlyIndex; ++k) {
                     final int rk = referenceCodePoints.get(k).codePointAt(0);
                     final String pRk = property.getValue(rk);
                     if (!Objects.equals(pRk, expectedDifference.referenceValueAlias)) {
@@ -687,7 +696,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                     }
                 }
             } else {
-                for (int k = 0; k < firstMultiCharacterIndex; ++k) {
+                for (int k = 0; k < firstValueOnlyIndex; ++k) {
                     final UnicodeSet set = sets.get(k);
                     final int rk = referenceCodePoints.get(k).codePointAt(0);
                     final String pRk = property.getValue(rk);
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/019.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/019.txt
@@ -0,0 +1,10 @@
+Ignoring Name Age:
+
+# U+18CFF is a blank character for the Khitan Small Script; aside from looking blank,
+# it is indistinguishable from other Khitan Small Script characters.  See L2/23-065.
+# In particular, it is ideographic:
+# https://www.unicode.org/review/pri497/feedback.html#ID20240216140104.
+Propertywise [\N{KHITAN SMALL SCRIPT CHARACTER-18CFF}
+              \N{KHITAN SMALL SCRIPT CHARACTER-18B00}] AreAlike
+
+end Ignoring;
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/030.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/030.txt
@@ -0,0 +1,12 @@
+Ignoring Name Age:
+
+# Garay is a right-to-left cased script:
+Propertywise [\N{GARAY SMALL LETTER A} - \N{GARAY SMALL LETTER OLD NA}]
+           : [\N{GARAY CAPITAL LETTER A} - \N{GARAY CAPITAL LETTER OLD NA}]
+CorrespondTo [\N{OLD HUNGARIAN SMALL LETTER A}]
+           : [\N{OLD HUNGARIAN CAPITAL LETTER A}]
+    UpTo: Block             (Garay vs Old_Hungarian),
+          Script            (Garay vs Old_Hungarian),
+          Script_Extensions (Garay vs Old_Hungarian)
+
+end Ignoring;
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/070.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/070.txt
@@ -0,0 +1,9 @@
+Ignoring Name Age:
+
+# HXG (briefly known as HZXG) and SZP are just like all the other CJK strokes.
+# In particular, they are scx=Hani:
+# https://www.unicode.org/review/pri502/feedback.html#ID20240523095709.
+Propertywise [\N{CJK STROKE HXG}\N{CJK STROKE SZP}
+              \N{CJK STROKE T}] AreAlike
+
+end Ignoring;
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/readme.md b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/readme.md
@@ -0,0 +1,10 @@
+# Property comparison tests
+
+Files in this directory are named after the numbers of the corresponding
+[RMG](https://github.com/unicode-org/utc-release-management) pipeline issues.
+Each file contains the tests comparing the properties of proposed characters to properties of
+pre-existing characters, developed as part of PAG review.
+
+Property comparison tests were not in place when properties were initially assigned for the 16.0
+repertoire; some have been retroactively created.  Comments in those files note feedback on errors
+that would have been caught by the tests.
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/template.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons/template.txt
@@ -0,0 +1,16 @@
+# [Template for property comparison tests of character encoding proposals]
+# [RMG ISSUE TITLE]
+# https://github.com/unicode-org/utc-release-management/issues/[RMG ISSUE NUMBER]
+
+# Names always differ.
+# Age always differs since these tests are comparing additions to pre-existing characters.
+Ignoring Name Age:
+
+# Ignore the security and IDNA properties, as these are not yet included for provisionally assigned characters.
+Ignoring Confusable_MA Identifier_Status Identifier_Type Idn_Status Idn_Mapping Idn_2008:
+
+# [TEST GOES HERE]
+
+end Ignoring;
+
+end Ignoring;
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -143,17 +143,15 @@
             end Ignoring;
 #
 ##########################
-# Propertywise <S₁> : ... : <Sₙ>
-# CorrespondTo <R₁> : ... : <Rₙ>
+# Propertywise <S₁> : ... : <Sₘ> [ ⧴ <Sₘ₊₁> ⧴ ... ⧴ <Sₙ> ]
+# CorrespondTo <R₁> : ... : <Rₘ> [ ⧴ <Rₘ₊₁> ⧴ ... ⧴ <Rₙ> ]
 # [   UpTo: <Property> (<SValue> vs <RValue>) {, <Property> (<SValue> vs <RValue>) }]
 #
 # The Sₖ must be Unicode sets of equal size, either with no strings or only strings.
 # They are considered in code point order for the correspondence check (item 2 below).
 # The references Rₖ must be Unicode sets each containing a single code point or a single string;
 # by a slight abuse of notation we refer to the code point or string as Rₖ in the explanation below.
-# For some m in 2 .. n, the following must hold:
-# a. Rₖ is a code point and Sₖ must contain only code points for k ≤ m, and
-# b. Rₖ is a string and Sₖ must contain only strings for m < k ≤ n, and
+# For k ≤ m, Rₖ must be a code point and Sₖ must contain only code points.
 # For every non-ignored property P that does not appear in the optional UpTo clause,
 # checks that for each k in 1 .. m, for the ith character C in Sₖ, either:
 # 1. P(C) = P(Rₖ), or
@@ -163,6 +161,9 @@
 # For every non-ignored property P that appears in the UpTo clause, checks all characters in the
 # sets Sₖ have the SValue and all R characters have the RValue.
 #
+# Note that only the properties of the characters in Sₖ and Rₖ where k ≤ m are inspected; in other
+# words, the characters and strings prefixed by ⧴ are only considered as property values.
+#
 # With n=1 this check is equivalent to the more straightforward AreAlike check; however, it also
 # allows for testing of properties such as case mappings, which differ for most characters in a
 # script, but behave regularly.  See the examples below.
@@ -1369,8 +1370,8 @@ Ignoring Unicode_1_Name Confusable_MA:
         CorrespondTo [ⁱ] : [i] : [I]
     end Ignoring;
 
-    Propertywise [ゟ] : [{より}]
-    CorrespondTo [ヿ] : [{コト}]
+    Propertywise [ゟ] ⧴ [{より}]
+    CorrespondTo [ヿ] ⧴ [{コト}]
         UpTo: Block             (Hiragana vs Katakana),
               Script            (Hiragana vs Katakana),
               Script_Extensions (Hiragana vs Katakana),
diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java
@@ -5,6 +5,7 @@
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
+import java.io.File;
 import java.io.IOException;
 import java.text.ParseException;
 import java.text.ParsePosition;
@@ -41,9 +42,20 @@ void testUnicodeInvariants() throws IOException {
 
     @Test
     void testAdditionComparisons() throws IOException {
-        int rc =
-                TestUnicodeInvariants.testInvariants(
-                        "AdditionComparisons.txt", "addition-comparisons", true);
+        final var directory = new File(Settings.SRC_DIR + "UCD/AdditionComparisons/");
+        int rc = 0;
+        for (var file : directory.listFiles()) {
+            final String filename = file.getName();
+            if (!file.getName().endsWith(".txt")) {
+                continue;
+            }
+            final String nameWithoutExtension = filename.substring(0, filename.length() - 4);
+            rc +=
+                    TestUnicodeInvariants.testInvariants(
+                            "AdditionComparisons/" + filename,
+                            "addition-comparisons-" + nameWithoutExtension,
+                            true);
+        }
         assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(addition-comparisons) failed");
     }