diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 98266713ec..6e3e40f969 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -14,6 +14,8 @@ import java.util.Map; import java.util.regex.Pattern; import org.unicode.cldr.util.MultiComparator; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodeProperty.PatternMatcher; @@ -340,6 +342,15 @@ private boolean applyPropertyAlias0( } } set = prop.getSet(propertyValue); + if (set.isEmpty() + && prop instanceof IndexUnicodeProperties.IndexUnicodeProperty + && prop.getName().equals("Name")) { + set = + ((IndexUnicodeProperties.IndexUnicodeProperty) prop) + .getFactory() + .getProperty(UcdProperty.Name_Alias) + .getSet(propertyValue); + } } } else if (isAge) { set = new UnicodeSet(); diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index 168fe98bd5..a87f07ef89 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -175,6 +175,53 @@ public void TestInteriorlyNegatedComparison() { "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]"); } + @Test + public void TestNameMatching() { + // UAX44-LM2 for both Name and Name_Alias. + checkSetsEqual("\\p{Name=NO-BREAK SPACE}", "[\\xA0]"); + checkSetsEqual("\\p{Name=no break space}", "[\\xA0]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-E}", "[\\u1180]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]"); + checkSetsEqual("\\p{Name=Hangul jungseong o-e}", "[\\u1180]"); + checkSetsEqual("\\p{Name=Hangul jungseong oe}", "[\\u116C]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -E}", "[\\u1180]"); + checkSetsEqual("\\p{Name= HANGUL JUNGSEONG O-E }", "[\\u1180]"); + checkSetsEqual("\\p{Name=_HANGUL_JUNGSEONG_O-E_}", "[\\u1180]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-EO}", "[\\u117F]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE O}", "[\\u117F]"); + checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -EO}", "[]"); + checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]"); + checkSetsEqual("\\p{Name=MARCHEN_LETTER_-A}", "[\\x{11C88}]"); + checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]"); + checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]"); + checkSetsEqual("\\p{Name=TIBETAN MARK TSA PHRU}", "[]"); + checkSetsEqual("\\p{Name=TIBETAN MARK BKA- SHOG YIG MGO}", "[\\u0F0A]"); + checkSetsEqual("\\p{Name=TIBETAN MARK BKA SHOG YIG MGO}", "[]"); + checkSetsEqual("\\p{Name_Alias=newline}", "[\\x0A]"); + checkSetsEqual("\\p{Name_Alias=NEW LINE}", "[\\x0A]"); + // The medial hyphen is only significant in HANGUL JUNGSEONG O-E, not in arbitrary O-E/OE. + checkSetsEqual("\\p{Name=twoemdash}", "⸺"); + checkSetsEqual("\\p{Name=SeeNoEvil_Monkey}", "🙈"); + checkSetsEqual("\\p{Name=BALLET S-H-O-E-S}", "🩰"); + checkSetsEqual("[\\p{Name=LATIN SMALL LIGATURE O-E}uf]", "[œuf]"); + } + + @Test + public void TestNameAliases() { + // Name_Alias values behave as aliases for Name, but not vice-versa. + checkSetsEqual( + "\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[︘]"); + checkSetsEqual( + "\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]"); + checkSetsEqual( + "\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", + "[]"); + checkSetsEqual( + "\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", + "[︘]"); + checkProperties("\\p{Name_Alias=@none@}", "[a-z]"); + } + @Test public void TestIdentityQuery() { checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}"); diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index 88f049bea0..241cb7b921 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -196,6 +196,7 @@ public Map getCacheFileSize() { static final Transform fromNumericPinyin = Transliterator.getInstance("NumericPinyin-Latin;nfc"); + static final Merge MULTIVALUED_JOINER = new PropertyUtilities.Joiner("|"); static final Merge ALPHABETIC_JOINER = new Merge() { TreeSet sorted = new TreeSet(); @@ -684,7 +685,7 @@ public VersionInfo getUcdVersion() { // .get(toSkeleton(propertyAlias)); // } - class IndexUnicodeProperty extends UnicodeProperty.BaseProperty { + public class IndexUnicodeProperty extends UnicodeProperty.BaseProperty { private final UcdProperty prop; private final Map stringToNamedEnum; @@ -724,6 +725,10 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty { } } + public IndexUnicodeProperties getFactory() { + return IndexUnicodeProperties.this; + } + @Override public boolean isTrivial() { return _getRawUnicodeMap().isEmpty() diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java index 5e60f04271..5b08551d36 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java @@ -884,7 +884,7 @@ private static void parsePropertyValueFile( && indexUnicodeProperties.ucdVersion.compareTo( VersionInfo.UNICODE_4_0) <= 0 - ? new PropertyUtilities.Joiner("|") + ? IndexUnicodeProperties.MULTIVALUED_JOINER : null; final var originalMultivaluedSplit = propInfo.multivaluedSplit; // The first version of kPrimaryNumeric had spaces in values. @@ -995,7 +995,7 @@ private static void parseNameAliasesFile( indexUnicodeProperties, nextProperties, propInfoSet, - IndexUnicodeProperties.ALPHABETIC_JOINER, + IndexUnicodeProperties.MULTIVALUED_JOINER, false); } } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index ee1ef259b9..e54761b123 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -448,7 +448,11 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { ? NULL_MATCHER : new SimpleMatcher( propertyValue, - isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR), + getName().equals("Name") || getName().equals("Name_Alias") + ? CHARACTER_NAME_COMPARATOR + : isType(STRING_OR_MISC_MASK) + ? null + : PROPERTY_COMPARATOR), result); } } @@ -720,39 +724,83 @@ public static String toSkeleton(String source) { return skeletonBuffer.toString(); } - /** Returns a representative of the equivalence class of source under UAX44-LM2. */ - public static String toNameSkeleton(String source) { + public static final Comparator CHARACTER_NAME_COMPARATOR = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return compareCharacterNames(o1, o2); + } + }; + + public static int compareCharacterNames(String a, String b) { + if (a == b) return 0; + if (a == null) return -1; + if (b == null) return 1; + return toNameSkeleton(a, false).compareTo(toNameSkeleton(b, false)); + } + + /** + * Returns a representative of the equivalence class of source under UAX44-LM2. If + * validate=true, checks that source contains only characters allowed in character names. + */ + public static String toNameSkeleton(String source, boolean validate) { if (source == null) return null; - StringBuffer result = new StringBuffer(); + StringBuilder result = new StringBuilder(); // remove spaces, medial '-' // we can do this with char, since no surrogates are involved for (int i = 0; i < source.length(); ++i) { char ch = source.charAt(i); + final char uppercase = Character.toUpperCase(ch); + if (validate && uppercase != ch) { + throw new IllegalArgumentException( + "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch); + } + ch = uppercase; if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') { result.append(ch); } else if (ch == ' ') { // don't copy ever } else if (ch == '-') { - // only copy non-medials AND trailing O-E - if (0 == i - || i == source.length() - 1 - || source.charAt(i - 1) == ' ' - || source.charAt(i + 1) == ' ' - || (i == source.length() - 2 - && source.charAt(i - 1) == 'O' - && source.charAt(i + 1) == 'E')) { - System.out.println("****** EXCEPTION " + source); + // Only copy a hyphen-minus if it is non-medial, or if it is + // the hyphen in U+1180 HANGUL JUNGSEONG O-E. + boolean medial; + if (0 == i || i == source.length() - 1) { + medial = false; // Name-initial or name-final. + } else { + medial = + Character.isLetterOrDigit(source.charAt(i - 1)) + && Character.isLetterOrDigit(source.charAt(i + 1)); + } + boolean is1180 = false; + if (medial + && i <= source.length() - 2 + && Character.toUpperCase(source.charAt(i + 1)) == 'E' + && result.toString().equals("HANGULJUNGSEONGO")) { + is1180 = true; + for (int j = i + 2; j < source.length(); ++j) { + if (source.charAt(j) != ' ' && source.charAt(j) != '_') { + is1180 = false; + } + } + } + if (!medial || is1180) { result.append(ch); } // otherwise don't copy - } else { + } else if (validate) { throw new IllegalArgumentException( "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch); + } else if (ch != '_') { + result.append(ch); } } return result.toString(); } + public static String toNameSkeleton(String source) { + return toNameSkeleton(source, true); + } + /** * These routines use the Java functions, because they only need to act on ASCII Changes space, * - into _, inserts _ between lower and UPPER.