From d4214b3d57799a4f0a2e6c8a9b6645bc25dbecdd Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 14:29:05 +0100 Subject: [PATCH 1/6] =?UTF-8?q?A=20failing=20test,=20though=20I=20don?= =?UTF-8?q?=E2=80=99t=20think=20it=20could=20run=20in=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/org/unicode/jsp/UcdLoader.java | 2 +- .../java/org/unicode/jsptest/TestUnicodeSet.java | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java index 2deab491ff..b333e06875 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java @@ -21,7 +21,7 @@ public static synchronized VersionInfo getOldestLoadedUcd() { return oldestLoadedUcd; } - private static synchronized void setOldestLoadedUcd(VersionInfo v) { + public static synchronized void setOldestLoadedUcd(VersionInfo v) { if (v.compareTo(oldestLoadedUcd) < 0) { oldestLoadedUcd = v; } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index a68acbc108..909cf793f6 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -33,11 +33,15 @@ import org.opentest4j.TestAbortedException; import org.unicode.jsp.CharEncoder; import org.unicode.jsp.Common; +import org.unicode.jsp.UcdLoader; import org.unicode.jsp.UnicodeJsp; import org.unicode.jsp.UnicodeSetUtilities; import org.unicode.jsp.UnicodeUtilities; import org.unicode.jsp.XPropertyFactory; +import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UnicodeProperty; +import org.unicode.props.UcdPropertyValues.Age_Values; +import org.unicode.text.utility.Settings; public class TestUnicodeSet extends TestFmwk2 { @@ -141,6 +145,14 @@ public void TestPretty() { logln(derived); } + @Test + public void TestGeneralCategoryGroupings() { + IndexUnicodeProperties.useIncrementalProperties(); + UcdLoader.setOldestLoadedUcd(VersionInfo.UNICODE_10_0); + checkSetsEqual("[\\p{U10:Lu}\\p{U10:Ll}\\p{U10:Lm}\\p{U10:Lt}\\p{U10:Lo}]", "\\p{U10:L}"); + UcdLoader.setOldestLoadedUcd(Settings.LAST_VERSION_INFO); + } + // public void TestAExemplars() { // checkProperties("[:exemplars_en:]", "[a]", "[\u0350]"); // } From dfcd1f5c116a12468a913abe5b9db5d489d3f7ad Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 14:34:11 +0100 Subject: [PATCH 2/6] Fix the bug --- .../java/org/unicode/jsptest/TestUnicodeSet.java | 1 - .../org/unicode/props/IndexUnicodeProperties.java | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index 909cf793f6..98c71c5139 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -40,7 +40,6 @@ import org.unicode.jsp.XPropertyFactory; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UnicodeProperty; -import org.unicode.props.UcdPropertyValues.Age_Values; import org.unicode.text.utility.Settings; public class TestUnicodeSet extends TestFmwk2 { diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index bdbd14e582..88f049bea0 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -795,14 +795,16 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { return super.getSet(matcher, result); } final long start = System.currentTimeMillis(); - final UnicodeSet baseSet = - baseVersionProperties.getProperty(prop).getSet(matcher, result); + final UnicodeSet baseSet = baseVersionProperties.getProperty(prop).getSet(matcher); final UnicodeSet matchingInThisVersion = super.getSet(matcher, null).retainAll(getDiffSet()); - result = - baseSet.addAll(matchingInThisVersion) - .removeAll( - getDiffSet().cloneAsThawed().removeAll(matchingInThisVersion)); + baseSet.addAll(matchingInThisVersion) + .removeAll(getDiffSet().cloneAsThawed().removeAll(matchingInThisVersion)); + if (result == null) { + result = baseSet; + } else { + result.addAll(baseSet); + } final long stop = System.currentTimeMillis(); final long Δt_in_ms = stop - start; if (Δt_in_ms > 100) { From 202b65f82b403cb6f2a77181c4452c3a29376d04 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 14:53:57 +0100 Subject: [PATCH 3/6] Combining_Mark --- .../org/unicode/jsp/UnicodeSetUtilities.java | 35 ++++++++++--------- .../org/unicode/jsptest/TestUnicodeSet.java | 8 ++++- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 34eed8b30d..23640e371f 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -9,6 +9,7 @@ import com.ibm.icu.util.ULocale; import com.ibm.icu.util.VersionInfo; import java.text.ParsePosition; +import java.util.Arrays; import java.util.Comparator; import java.util.List; import java.util.regex.Pattern; @@ -221,15 +222,15 @@ public boolean applyPropertyAlias( return status; } - private static String[][] COARSE_GENERAL_CATEGORIES = { - {"Other", "C", "Cc", "Cf", "Cn", "Co", "Cs"}, - {"Letter", "L", "Ll", "Lm", "Lo", "Lt", "Lu"}, - {"Cased_Letter", "LC", "Ll", "Lt", "Lu"}, - {"Mark", "M", "Mc", "Me", "Mn"}, - {"Number", "N", "Nd", "Nl", "No"}, - {"Punctuation", "P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"}, - {"Symbol", "S", "Sc", "Sk", "Sm", "So"}, - {"Separator", "Z", "Zl", "Zp", "Zs"}, + private static String[][][] COARSE_GENERAL_CATEGORIES = { + {{"Other", "C"}, {"Cc", "Cf", "Cn", "Co", "Cs"}}, + {{"Letter", "L"}, {"Ll", "Lm", "Lo", "Lt", "Lu"}}, + {{"Cased_Letter", "LC"}, {"Ll", "Lt", "Lu"}}, + {{"Mark", "M", "Combining_Mark"}, {"Mc", "Me", "Mn"}}, + {{"Number", "N"}, {"Nd", "Nl", "No"}}, + {{"Punctuation", "P"}, {"Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"}}, + {{"Symbol", "S"}, {"Sc", "Sk", "Sm", "So"}}, + {{"Separator", "Z"}, {"Zl", "Zp", "Zs"}}, }; // TODO(eggrobin): I think this function only ever returns true; might as well make it void. @@ -304,13 +305,15 @@ private boolean applyPropertyAlias0( UnicodePropertySymbolTable::parseVersionInfoOrMax)); } else { if (prop.getName().equals("General_Category")) { - for (String[] coarseValue : COARSE_GENERAL_CATEGORIES) { - final String longName = coarseValue[0]; - final String shortName = coarseValue[1]; - if (UnicodeProperty.equalNames(propertyValue, longName) - || UnicodeProperty.equalNames(propertyValue, shortName)) { - for (int i = 2; i < coarseValue.length; ++i) { - prop.getSet(coarseValue[i], result); + for (String[][] coarseValue : COARSE_GENERAL_CATEGORIES) { + final String[] aliases = coarseValue[0]; + if (Arrays.stream(aliases) + .anyMatch( + a -> + UnicodeProperty.equalNames( + propertyValue, a))) { + for (var value : coarseValue[1]) { + prop.getSet(value, result); } return true; } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index 98c71c5139..ca4cd9bb5d 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -145,13 +145,19 @@ public void TestPretty() { } @Test - public void TestGeneralCategoryGroupings() { + public void TestGeneralCategoryGroupingsWithIncrementalProperties() { IndexUnicodeProperties.useIncrementalProperties(); UcdLoader.setOldestLoadedUcd(VersionInfo.UNICODE_10_0); checkSetsEqual("[\\p{U10:Lu}\\p{U10:Ll}\\p{U10:Lm}\\p{U10:Lt}\\p{U10:Lo}]", "\\p{U10:L}"); UcdLoader.setOldestLoadedUcd(Settings.LAST_VERSION_INFO); } + @Test + public void TestGeneralCategoryGroupings() { + checkSetsEqual("[\\p{Lu}\\p{Ll}\\p{Lm}\\p{Lt}\\p{Lo}]", "\\p{L}"); + checkSetsEqual("[\\p{Mc}\\p{Me}\\p{Mn}]", "\\p{gc=Combining_Mark}"); + } + // public void TestAExemplars() { // checkProperties("[:exemplars_en:]", "[a]", "[\u0350]"); // } From bbb7f5a7fe09ce779fb04635166ad57df67de96b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 19:42:12 +0100 Subject: [PATCH 4/6] Gate it behind a flag until I figure out something smarter --- .../src/test/java/org/unicode/jsptest/TestUnicodeSet.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index ca4cd9bb5d..1c1a71c283 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -27,6 +27,7 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; +import org.junit.jupiter.api.condition.EnabledIfSystemProperty; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -145,6 +146,10 @@ public void TestPretty() { } @Test + @EnabledIfSystemProperty( + named = "UNICODETOOLS_TEST_WITH_INCREMENTAL_PROPERTIES", + matches = ".*", + disabledReason = "Tests with incremental properties must be run separately") public void TestGeneralCategoryGroupingsWithIncrementalProperties() { IndexUnicodeProperties.useIncrementalProperties(); UcdLoader.setOldestLoadedUcd(VersionInfo.UNICODE_10_0); From 66acf97f0ccd9ebac39a580121e8ef51ab857455 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 20:12:14 +0100 Subject: [PATCH 5/6] Remove unused and wrong SUPER_CATEGORIES --- .../src/main/java/org/unicode/text/UCD/UCD_Names.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java index b035b9f9ca..edf1a21b4d 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java @@ -747,17 +747,6 @@ public final class UCD_Names implements UCD_Types { // usage) }; - static final String[][] SUPER_CATEGORIES = { - {"L", "Letter", null, "Ll | Lm | Lo | Lt | Lu"}, - {"M", "Mark", "Combining_Mark", "Mc | Me | Mn"}, - {"N", "Number", null, "Nd | Nl | No"}, - {"Z", "Separator", null, "Zl | Zp | Zs"}, - {"C", "Other", "cntrl", "Cc | Cf | Cn | Co | Cs"}, - {"S", "Symbol", null, "Sc | Sk | Sm | So"}, - {"P", "Punctuation", "punct", "Pc | Pd | Pe | Pf | Pi | Po | Ps"}, - {"LC", "Cased Letter", null, "Ll | Lt | Lu"}, - }; - public static final Relation EXTRA_GENERAL_CATEGORY = new Relation(new TreeMap>(), LinkedHashSet.class); From 038e6d37260e9c54b6b3fcb37ac64bb39b8eb423 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 12 Mar 2025 20:32:18 +0100 Subject: [PATCH 6/6] Aliases from PVA --- .../org/unicode/jsp/UnicodeSetUtilities.java | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index aabb737378..98266713ec 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -9,11 +9,12 @@ import com.ibm.icu.util.ULocale; import com.ibm.icu.util.VersionInfo; import java.text.ParsePosition; -import java.util.Arrays; import java.util.Comparator; import java.util.List; +import java.util.Map; import java.util.regex.Pattern; import org.unicode.cldr.util.MultiComparator; +import org.unicode.props.UcdPropertyValues; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.PatternMatcher; import org.unicode.props.UnicodePropertySymbolTable; @@ -222,16 +223,25 @@ public boolean applyPropertyAlias( return status; } - private static String[][][] COARSE_GENERAL_CATEGORIES = { - {{"Other", "C"}, {"Cc", "Cf", "Cn", "Co", "Cs"}}, - {{"Letter", "L"}, {"Ll", "Lm", "Lo", "Lt", "Lu"}}, - {{"Cased_Letter", "LC"}, {"Ll", "Lt", "Lu"}}, - {{"Mark", "M", "Combining_Mark"}, {"Mc", "Me", "Mn"}}, - {{"Number", "N"}, {"Nd", "Nl", "No"}}, - {{"Punctuation", "P"}, {"Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"}}, - {{"Symbol", "S"}, {"Sc", "Sk", "Sm", "So"}}, - {{"Separator", "Z"}, {"Zl", "Zp", "Zs"}}, - }; + private static Map + COARSE_GENERAL_CATEGORIES = + Map.of( + UcdPropertyValues.General_Category_Values.Other, + new String[] {"Cc", "Cf", "Cn", "Co", "Cs"}, + UcdPropertyValues.General_Category_Values.Letter, + new String[] {"Ll", "Lm", "Lo", "Lt", "Lu"}, + UcdPropertyValues.General_Category_Values.Cased_Letter, + new String[] {"Ll", "Lt", "Lu"}, + UcdPropertyValues.General_Category_Values.Mark, + new String[] {"Mc", "Me", "Mn"}, + UcdPropertyValues.General_Category_Values.Number, + new String[] {"Nd", "Nl", "No"}, + UcdPropertyValues.General_Category_Values.Punctuation, + new String[] {"Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"}, + UcdPropertyValues.General_Category_Values.Symbol, + new String[] {"Sc", "Sk", "Sm", "So"}, + UcdPropertyValues.General_Category_Values.Separator, + new String[] {"Zl", "Zp", "Zs"}); // TODO(eggrobin): I think this function only ever returns true; might as well make it void. private boolean applyPropertyAlias0( @@ -315,14 +325,14 @@ private boolean applyPropertyAlias0( UnicodePropertySymbolTable::parseVersionInfoOrMax)); } else { if (prop.getName().equals("General_Category")) { - for (String[][] coarseValue : COARSE_GENERAL_CATEGORIES) { - final String[] aliases = coarseValue[0]; - if (Arrays.stream(aliases) + for (var entry : COARSE_GENERAL_CATEGORIES.entrySet()) { + final var aliases = entry.getKey().getNames().getAllNames(); + if (aliases.stream() .anyMatch( a -> UnicodeProperty.equalNames( propertyValue, a))) { - for (var value : coarseValue[1]) { + for (var value : entry.getValue()) { prop.getSet(value, result); } return true;