Skip to content

Commit 5b4d24f

Browse files
authored
Split addition comparisons (#942)
* Improvements to the invariant test language * Split the addition comparison test files
1 parent 0235209 commit 5b4d24f

File tree

9 files changed

+111
-74
lines changed

9 files changed

+111
-74
lines changed

unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -563,13 +563,11 @@ private static void propertywiseCorrespondenceLine(
563563
final List<String> errorMessageLines = new ArrayList<>();
564564
final List<UnicodeSet> sets = new ArrayList<>();
565565
sets.add(firstSet);
566-
expectToken(":", pp, source);
567566

568-
// Index of the first set of multi-character strings (and of the first multi-character
569-
// reference string).
567+
// Parse the sets prefixed by : first; these sets may not contain strings.
568+
// Only value-only sets (prefixed by ⧴ rather than :) may contain multi-character strings.
570569
// The number of sets collected once this loop ends is `m` in the documentation in UnicodeInvariantTest.txt.
571-
int firstMultiCharacterIndex = -1;
572-
do {
570+
while (Lookahead.oneToken(pp, source).accept(":")) {
573571
final var set = parseUnicodeSet(source, pp);
574572
if (set.size() != firstSet.size()) {
575573
throw new BackwardParseException(
@@ -580,24 +578,29 @@ private static void propertywiseCorrespondenceLine(
580578
+ ")",
581579
pp.getIndex());
582580
}
583-
if (set.hasStrings() && set.strings().size() != set.size()) {
581+
if (set.hasStrings()) {
584582
throw new BackwardParseException(
585-
"Sets should be all strings or all code points for property correspondence",
583+
"Strings are only allowed in value-only sets (prefixed by ⧴ rather than :)",
586584
pp.getIndex());
587585
}
588-
if (firstMultiCharacterIndex == -1) {
589-
if (set.hasStrings()) {
590-
firstMultiCharacterIndex = sets.size();
591-
}
592-
} else if (!set.hasStrings()) {
586+
sets.add(set);
587+
}
588+
// Index of the first set of value-only sets (prefixed by ⧴ rather than :).
589+
// Only value-only sets may contain multi-character strings.
590+
// This is `m` in the documentation in UnicodeInvariantTest.txt.
591+
final int firstValueOnlyIndex = sets.size();
592+
while (Lookahead.oneToken(pp, source).accept("⧴")) {
593+
final var set = parseUnicodeSet(source, pp);
594+
if (set.size() != firstSet.size()) {
593595
throw new BackwardParseException(
594-
"Code points should come before strings in property correspondence",
596+
"Sets should have the same size for property correspondence (got "
597+
+ set.size()
598+
+ ", expected "
599+
+ firstSet.size()
600+
+ ")",
595601
pp.getIndex());
596602
}
597603
sets.add(set);
598-
} while (Lookahead.oneToken(pp, source).accept(":"));
599-
if (firstMultiCharacterIndex == -1) {
600-
firstMultiCharacterIndex = sets.size();
601604
}
602605
final List<String> referenceCodePoints = new ArrayList<>();
603606
expectToken("CorrespondTo", pp, source);
@@ -608,14 +611,14 @@ private static void propertywiseCorrespondenceLine(
608611
"reference should be a single code point or string for property correspondence",
609612
pp.getIndex());
610613
}
611-
if (referenceSet.hasStrings()
612-
!= (referenceCodePoints.size() >= firstMultiCharacterIndex)) {
614+
if (referenceSet.hasStrings() && referenceCodePoints.size() < firstValueOnlyIndex) {
613615
throw new BackwardParseException(
614-
"Strings should correspond to strings for property correspondence",
616+
"Strings are only allowed in value-only sets (prefixed by ⧴ rather than :)",
615617
pp.getIndex());
616618
}
617619
referenceCodePoints.add(referenceSet.iterator().next());
618-
} while (Lookahead.oneToken(pp, source).accept(":"));
620+
} while (Lookahead.oneToken(pp, source)
621+
.accept(referenceCodePoints.size() >= firstValueOnlyIndex ? "⧴" : ":"));
619622
if (referenceCodePoints.size() != sets.size()) {
620623
throw new BackwardParseException(
621624
"Property correspondence requires as many reference code points as sets under test",
@@ -638,8 +641,14 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
638641
String property = Lookahead.oneToken(pp, source).consume();
639642
expectToken("(", pp, source);
640643
String actualValueAlias = Lookahead.oneToken(pp, source).consume();
644+
while (Lookahead.oneToken(pp, source).accept("|")) {
645+
actualValueAlias += "|" + Lookahead.oneToken(pp, source).consume();
646+
}
641647
expectToken("vs", pp, source);
642648
String referenceValueAlias = Lookahead.oneToken(pp, source).consume();
649+
while (Lookahead.oneToken(pp, source).accept("|")) {
650+
referenceValueAlias += "|" + Lookahead.oneToken(pp, source).consume();
651+
}
643652
expectToken(")", pp, source);
644653
expectedPropertyDifferences.put(
645654
property,
@@ -657,7 +666,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
657666
expectedDifference = expectedPropertyDifferences.get(alias);
658667
}
659668
if (expectedDifference != null) {
660-
for (int k = 0; k < firstMultiCharacterIndex; ++k) {
669+
for (int k = 0; k < firstValueOnlyIndex; ++k) {
661670
final int rk = referenceCodePoints.get(k).codePointAt(0);
662671
final String pRk = property.getValue(rk);
663672
if (!Objects.equals(pRk, expectedDifference.referenceValueAlias)) {
@@ -687,7 +696,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
687696
}
688697
}
689698
} else {
690-
for (int k = 0; k < firstMultiCharacterIndex; ++k) {
699+
for (int k = 0; k < firstValueOnlyIndex; ++k) {
691700
final UnicodeSet set = sets.get(k);
692701
final int rk = referenceCodePoints.get(k).codePointAt(0);
693702
final String pRk = property.getValue(rk);

unicodetools/src/main/resources/org/unicode/text/UCD/AdditionComparisons.txt

Lines changed: 0 additions & 42 deletions
This file was deleted.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Ignoring Name Age:
2+
3+
# U+18CFF is a blank character for the Khitan Small Script; aside from looking blank,
4+
# it is indistinguishable from other Khitan Small Script characters. See L2/23-065.
5+
# In particular, it is ideographic:
6+
# https://www.unicode.org/review/pri497/feedback.html#ID20240216140104.
7+
Propertywise [\N{KHITAN SMALL SCRIPT CHARACTER-18CFF}
8+
\N{KHITAN SMALL SCRIPT CHARACTER-18B00}] AreAlike
9+
10+
end Ignoring;
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Ignoring Name Age:
2+
3+
# Garay is a right-to-left cased script:
4+
Propertywise [\N{GARAY SMALL LETTER A} - \N{GARAY SMALL LETTER OLD NA}]
5+
: [\N{GARAY CAPITAL LETTER A} - \N{GARAY CAPITAL LETTER OLD NA}]
6+
CorrespondTo [\N{OLD HUNGARIAN SMALL LETTER A}]
7+
: [\N{OLD HUNGARIAN CAPITAL LETTER A}]
8+
UpTo: Block (Garay vs Old_Hungarian),
9+
Script (Garay vs Old_Hungarian),
10+
Script_Extensions (Garay vs Old_Hungarian)
11+
12+
end Ignoring;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Ignoring Name Age:
2+
3+
# HXG (briefly known as HZXG) and SZP are just like all the other CJK strokes.
4+
# In particular, they are scx=Hani:
5+
# https://www.unicode.org/review/pri502/feedback.html#ID20240523095709.
6+
Propertywise [\N{CJK STROKE HXG}\N{CJK STROKE SZP}
7+
\N{CJK STROKE T}] AreAlike
8+
9+
end Ignoring;
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Property comparison tests
2+
3+
Files in this directory are named after [RMG](https://github.com/unicode-org/utc-release-management)
4+
pipeline issues.
5+
Each file contains the tests comparing the properties of proposed characters to properties of
6+
pre-existing characters, developed as part of PAG review.
7+
8+
Property comparison tests were not in place when properties were initially assigned for the 16.0
9+
répertoire; some have been retroactively created. Comments in those files note feedback on errors
10+
that would have been caught by the tests.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# [Template for property comparison tests of character encoding proposals]
2+
# [RMG ISSUE TITLE]
3+
# https://github.com/unicode-org/utc-release-management/issues/[RMG ISSUE NUMBER]
4+
5+
# Names always differ.
6+
# Age always differs since these tests are comparing additions to pre-existing characters.
7+
Ignoring Name Age:
8+
9+
# Ignore the security and IDNA properties, as these are not yet included for provisionally assigned characters.
10+
Ignoring Confusable_MA Identifier_Status Identifier_Type Idn_Status Idn_Mapping Idn_2008:
11+
12+
# [TEST GOES HERE]
13+
14+
end Ignoring;
15+
16+
end Ignoring;

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,17 +143,15 @@
143143
end Ignoring;
144144
#
145145
##########################
146-
# Propertywise <S₁> : ... : <Sₙ>
147-
# CorrespondTo <R₁> : ... : <Rₙ>
146+
# Propertywise <S₁> : ... : <Sₘ> [ ⧴ <Sₘ₊₁> ⧴ ... ⧴ <Sₙ> ]
147+
# CorrespondTo <R₁> : ... : <Rₘ> [ ⧴ <Rₘ₊₁> ⧴ ... ⧴ <Rₙ> ]
148148
# [ UpTo: <Property> (<SValue> vs <RValue>) {, <Property> (<SValue> vs <RValue>) }]
149149
#
150150
# The Sₖ must be Unicode sets of equal size; only the value-only sets (those prefixed by ⧴) may contain strings.
151151
# They are considered in code point order for the correspondence check (item 2 below).
152152
# The references Rₖ must be Unicode sets each containing a single code point or a single string;
153153
# by a slight abuse of notation we refer to the code point or string as Rₖ in the explanation below.
154-
# For some m in 2 .. n, the following must hold:
155-
# a. Rₖ is a code point and Sₖ must contain only code points for k ≤ m, and
156-
# b. Rₖ is a string and Sₖ must contain only strings for m < k ≤ n, and
154+
# For k ≤ m, Rₖ must be a code point and Sₖ must contain only code points.
157155
# For every non-ignored property P that does not appear in the optional UpTo clause,
158156
# checks that for each k in 1 .. m, for the ith character C in Sₖ, either:
159157
# 1. P(C) = P(Rₖ), or
@@ -163,6 +161,9 @@
163161
# For every non-ignored property P that appears in the UpTo clause, checks all characters in the
164162
# sets Sₖ have the SValue and all R characters have the RValue.
165163
#
164+
# Note that only the properties of the characters in Sₖ and Rₖ where k ≤ m are inspected; in other
165+
# words, the characters and strings prefixed by ⧴ are only considered as property values.
166+
#
166167
# With n=1 this check is equivalent to the more straightforward AreAlike check; however, it also
167168
# allows for testing of properties such as case mappings, which differ for most characters in a
168169
# script, but behave regularly. See the examples below.
@@ -1369,8 +1370,8 @@ Ignoring Unicode_1_Name Confusable_MA:
13691370
CorrespondTo [ⁱ] : [i] : [I]
13701371
end Ignoring;
13711372

1372-
Propertywise [ゟ] : [{より}]
1373-
CorrespondTo [ヿ] : [{コト}]
1373+
Propertywise [ゟ] ⧴ [{より}]
1374+
CorrespondTo [ヿ] ⧴ [{コト}]
13741375
UpTo: Block (Hiragana vs Katakana),
13751376
Script (Hiragana vs Katakana),
13761377
Script_Extensions (Hiragana vs Katakana),

unicodetools/src/test/java/org/unicode/text/UCD/TestTestUnicodeInvariants.java

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import static org.junit.jupiter.api.Assertions.assertNotNull;
66
import static org.junit.jupiter.api.Assertions.assertThrows;
77

8+
import java.io.File;
89
import java.io.IOException;
910
import java.text.ParseException;
1011
import java.text.ParsePosition;
@@ -41,9 +42,20 @@ void testUnicodeInvariants() throws IOException {
4142

4243
@Test
4344
void testAdditionComparisons() throws IOException {
44-
int rc =
45-
TestUnicodeInvariants.testInvariants(
46-
"AdditionComparisons.txt", "addition-comparisons", true);
45+
final var directory = new File(Settings.SRC_DIR + "UCD/AdditionComparisons/");
46+
int rc = 0;
47+
for (var file : directory.listFiles()) {
48+
final String filename = file.getName();
49+
if (!file.getName().endsWith(".txt")) {
50+
continue;
51+
}
52+
final String nameWithoutExtension = filename.substring(0, filename.length() - 4);
53+
rc +=
54+
TestUnicodeInvariants.testInvariants(
55+
"AdditionComparisons/" + filename,
56+
"addition-comparisons-" + nameWithoutExtension,
57+
true);
58+
}
4759
assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(addition-comparisons) failed");
4860
}
4961

0 commit comments

Comments
 (0)