diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java index 2deab491ff..b333e06875 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UcdLoader.java @@ -21,7 +21,7 @@ public static synchronized VersionInfo getOldestLoadedUcd() { return oldestLoadedUcd; } - private static synchronized void setOldestLoadedUcd(VersionInfo v) { + public static synchronized void setOldestLoadedUcd(VersionInfo v) { if (v.compareTo(oldestLoadedUcd) < 0) { oldestLoadedUcd = v; } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index e4f323a3db..98266713ec 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -11,8 +11,10 @@ import java.text.ParsePosition; import java.util.Comparator; import java.util.List; +import java.util.Map; import java.util.regex.Pattern; import org.unicode.cldr.util.MultiComparator; +import org.unicode.props.UcdPropertyValues; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.PatternMatcher; import org.unicode.props.UnicodePropertySymbolTable; @@ -221,16 +223,25 @@ public boolean applyPropertyAlias( return status; } - private static String[][] COARSE_GENERAL_CATEGORIES = { - {"Other", "C", "Cc", "Cf", "Cn", "Co", "Cs"}, - {"Letter", "L", "Ll", "Lm", "Lo", "Lt", "Lu"}, - {"Cased_Letter", "LC", "Ll", "Lt", "Lu"}, - {"Mark", "M", "Mc", "Me", "Mn"}, - {"Number", "N", "Nd", "Nl", "No"}, - {"Punctuation", "P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"}, - {"Symbol", "S", "Sc", "Sk", "Sm", "So"}, - {"Separator", "Z", "Zl", "Zp", "Zs"}, - }; + private static Map + COARSE_GENERAL_CATEGORIES = + Map.of( + UcdPropertyValues.General_Category_Values.Other, + new String[] {"Cc", "Cf", "Cn", "Co", "Cs"}, + UcdPropertyValues.General_Category_Values.Letter, + new String[] {"Ll", "Lm", "Lo", "Lt", "Lu"}, + UcdPropertyValues.General_Category_Values.Cased_Letter, + new String[] {"Ll", "Lt", "Lu"}, + UcdPropertyValues.General_Category_Values.Mark, + new String[] {"Mc", "Me", "Mn"}, + UcdPropertyValues.General_Category_Values.Number, + new String[] {"Nd", "Nl", "No"}, + UcdPropertyValues.General_Category_Values.Punctuation, + new String[] {"Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"}, + UcdPropertyValues.General_Category_Values.Symbol, + new String[] {"Sc", "Sk", "Sm", "So"}, + UcdPropertyValues.General_Category_Values.Separator, + new String[] {"Zl", "Zp", "Zs"}); // TODO(eggrobin): I think this function only ever returns true; might as well make it void. private boolean applyPropertyAlias0( @@ -314,13 +325,15 @@ private boolean applyPropertyAlias0( UnicodePropertySymbolTable::parseVersionInfoOrMax)); } else { if (prop.getName().equals("General_Category")) { - for (String[] coarseValue : COARSE_GENERAL_CATEGORIES) { - final String longName = coarseValue[0]; - final String shortName = coarseValue[1]; - if (UnicodeProperty.equalNames(propertyValue, longName) - || UnicodeProperty.equalNames(propertyValue, shortName)) { - for (int i = 2; i < coarseValue.length; ++i) { - prop.getSet(coarseValue[i], result); + for (var entry : COARSE_GENERAL_CATEGORIES.entrySet()) { + final var aliases = entry.getKey().getNames().getAllNames(); + if (aliases.stream() + .anyMatch( + a -> + UnicodeProperty.equalNames( + propertyValue, a))) { + for (var value : entry.getValue()) { + prop.getSet(value, result); } return true; } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index a47d607897..168fe98bd5 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -27,17 +27,21 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; +import org.junit.jupiter.api.condition.EnabledIfSystemProperty; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; import org.opentest4j.TestAbortedException; import org.unicode.jsp.CharEncoder; import org.unicode.jsp.Common; +import org.unicode.jsp.UcdLoader; import org.unicode.jsp.UnicodeJsp; import org.unicode.jsp.UnicodeSetUtilities; import org.unicode.jsp.UnicodeUtilities; import org.unicode.jsp.XPropertyFactory; +import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UnicodeProperty; +import org.unicode.text.utility.Settings; public class TestUnicodeSet extends TestFmwk2 { @@ -141,6 +145,24 @@ public void TestPretty() { logln(derived); } + @Test + @EnabledIfSystemProperty( + named = "UNICODETOOLS_TEST_WITH_INCREMENTAL_PROPERTIES", + matches = ".*", + disabledReason = "Tests with incremental properties must be run separately") + public void TestGeneralCategoryGroupingsWithIncrementalProperties() { + IndexUnicodeProperties.useIncrementalProperties(); + UcdLoader.setOldestLoadedUcd(VersionInfo.UNICODE_10_0); + checkSetsEqual("[\\p{U10:Lu}\\p{U10:Ll}\\p{U10:Lm}\\p{U10:Lt}\\p{U10:Lo}]", "\\p{U10:L}"); + UcdLoader.setOldestLoadedUcd(Settings.LAST_VERSION_INFO); + } + + @Test + public void TestGeneralCategoryGroupings() { + checkSetsEqual("[\\p{Lu}\\p{Ll}\\p{Lm}\\p{Lt}\\p{Lo}]", "\\p{L}"); + checkSetsEqual("[\\p{Mc}\\p{Me}\\p{Mn}]", "\\p{gc=Combining_Mark}"); + } + @Test public void TestInteriorlyNegatedComparison() { checkProperties("\\p{Uppercase≠@Changes_When_Lowercased@}", "[𝕬-𝖅]"); diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index bdbd14e582..88f049bea0 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -795,14 +795,16 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { return super.getSet(matcher, result); } final long start = System.currentTimeMillis(); - final UnicodeSet baseSet = - baseVersionProperties.getProperty(prop).getSet(matcher, result); + final UnicodeSet baseSet = baseVersionProperties.getProperty(prop).getSet(matcher); final UnicodeSet matchingInThisVersion = super.getSet(matcher, null).retainAll(getDiffSet()); - result = - baseSet.addAll(matchingInThisVersion) - .removeAll( - getDiffSet().cloneAsThawed().removeAll(matchingInThisVersion)); + baseSet.addAll(matchingInThisVersion) + .removeAll(getDiffSet().cloneAsThawed().removeAll(matchingInThisVersion)); + if (result == null) { + result = baseSet; + } else { + result.addAll(baseSet); + } final long stop = System.currentTimeMillis(); final long Δt_in_ms = stop - start; if (Δt_in_ms > 100) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java index b035b9f9ca..edf1a21b4d 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD_Names.java @@ -747,17 +747,6 @@ public final class UCD_Names implements UCD_Types { // usage) }; - static final String[][] SUPER_CATEGORIES = { - {"L", "Letter", null, "Ll | Lm | Lo | Lt | Lu"}, - {"M", "Mark", "Combining_Mark", "Mc | Me | Mn"}, - {"N", "Number", null, "Nd | Nl | No"}, - {"Z", "Separator", null, "Zl | Zp | Zs"}, - {"C", "Other", "cntrl", "Cc | Cf | Cn | Co | Cs"}, - {"S", "Symbol", null, "Sc | Sk | Sm | So"}, - {"P", "Punctuation", "punct", "Pc | Pd | Pe | Pf | Pi | Po | Ps"}, - {"LC", "Cased Letter", null, "Ll | Lt | Lu"}, - }; - public static final Relation EXTRA_GENERAL_CATEGORY = new Relation(new TreeMap>(), LinkedHashSet.class);