LM2 Name and Name_Alias matching (#1059)

eggrobin · markusicu · web-flow · commit 1450868fea14 · 2025-03-13T23:20:44.000+01:00
* Don’t invert twice on comparison queries, add support for null queries, align identity queries with the draft * more tests * LM3 is * comments * out of bounds * Check lb=@none@ (though that should probably be an error). * Millionfold falsification * Need to figure out how to make Name_Alias behave as an alias for Name… * Name_Alias as a Name alias; failing test (on ne fait pas d’omelette sans casser des œufs) * Put Humpty Dumpty together again. * uppercase once Co-authored-by: Markus Scherer <markus.icu@gmail.com> * Don’t yell about non-medial hyphens nor the hyphen in U+1180. * s/ff/ild/g * isLetterOrDigit --------- Co-authored-by: Markus Scherer <markus.icu@gmail.com>
diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java
@@ -14,6 +14,8 @@
 import java.util.Map;
 import java.util.regex.Pattern;
 import org.unicode.cldr.util.MultiComparator;
+import org.unicode.props.IndexUnicodeProperties;
+import org.unicode.props.UcdProperty;
 import org.unicode.props.UcdPropertyValues;
 import org.unicode.props.UnicodeProperty;
 import org.unicode.props.UnicodeProperty.PatternMatcher;
@@ -340,6 +342,15 @@ private boolean applyPropertyAlias0(
                             }
                         }
                         set = prop.getSet(propertyValue);
+                        if (set.isEmpty()
+                                && prop instanceof IndexUnicodeProperties.IndexUnicodeProperty
+                                && prop.getName().equals("Name")) {
+                            set =
+                                    ((IndexUnicodeProperties.IndexUnicodeProperty) prop)
+                                            .getFactory()
+                                            .getProperty(UcdProperty.Name_Alias)
+                                            .getSet(propertyValue);
+                        }
                     }
                 } else if (isAge) {
                     set = new UnicodeSet();
diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java
@@ -175,6 +175,53 @@ public void TestInteriorlyNegatedComparison() {
                 "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]");
     }
 
+    @Test
+    public void TestNameMatching() {
+        // UAX44-LM2 for Name and Name_Alias: ignore case, spaces, underscores, medial hyphens.
+        checkSetsEqual("\\p{Name=NO-BREAK SPACE}", "[\\xA0]");
+        checkSetsEqual("\\p{Name=no break space}", "[\\xA0]");
+        checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-E}", "[\\u1180]");
+        checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE}", "[\\u116C]");
+        checkSetsEqual("\\p{Name=Hangul jungseong o-e}", "[\\u1180]");
+        checkSetsEqual("\\p{Name=Hangul jungseong oe}", "[\\u116C]");
+        checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -E}", "[\\u1180]"); // non-medial hyphen is kept
+        checkSetsEqual("\\p{Name= HANGUL JUNGSEONG O-E }", "[\\u1180]");
+        checkSetsEqual("\\p{Name=_HANGUL_JUNGSEONG_O-E_}", "[\\u1180]");
+        checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O-EO}", "[\\u117F]");
+        checkSetsEqual("\\p{Name=HANGUL JUNGSEONG OE O}", "[\\u117F]");
+        checkSetsEqual("\\p{Name=HANGUL JUNGSEONG O -EO}", "[]"); // hyphen kept here, unlike in O-EO
+        checkSetsEqual("\\p{Name=MARCHEN LETTER -A}", "[\\x{11C88}]");
+        checkSetsEqual("\\p{Name=MARCHEN_LETTER_-A}", "[\\x{11C88}]");
+        checkSetsEqual("\\p{Name=MARCHEN LETTER A}", "[\\x{11C8F}]");
+        checkSetsEqual("\\p{Name=TIBETAN MARK TSA -PHRU}", "[\\u0F39]");
+        checkSetsEqual("\\p{Name=TIBETAN MARK TSA PHRU}", "[]");
+        checkSetsEqual("\\p{Name=TIBETAN MARK BKA- SHOG YIG MGO}", "[\\u0F0A]");
+        checkSetsEqual("\\p{Name=TIBETAN MARK BKA SHOG YIG MGO}", "[]");
+        checkSetsEqual("\\p{Name_Alias=newline}", "[\\x0A]");
+        checkSetsEqual("\\p{Name_Alias=NEW LINE}", "[\\x0A]");
+        // The medial hyphen is only significant in HANGUL JUNGSEONG O-E, not in arbitrary O-E/OE.
+        checkSetsEqual("\\p{Name=twoemdash}", "⸺");
+        checkSetsEqual("\\p{Name=SeeNoEvil_Monkey}", "🙈");
+        checkSetsEqual("\\p{Name=BALLET S-H-O-E-S}", "🩰");
+        checkSetsEqual("[\\p{Name=LATIN SMALL LIGATURE O-E}uf]", "[œuf]");
+    }
+
+    @Test
+    public void TestNameAliases() {
+        // Name_Alias values also match as Name, not vice-versa; BRAKCET (sic) is the Name value.
+        checkSetsEqual(
+                "\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}", "[︘]");
+        checkSetsEqual(
+                "\\p{Name=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}", "[︘]");
+        checkSetsEqual(
+                "\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}",
+                "[]");
+        checkSetsEqual(
+                "\\p{Name_Alias=PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}",
+                "[︘]");
+        checkProperties("\\p{Name_Alias=@none@}", "[a-z]"); // TODO confirm: a-z lack a Name_Alias
+    }
+
     @Test
     public void TestIdentityQuery() {
         checkSetsEqual("\\p{NFKC_Casefold=@code point@}", "\\P{Changes_When_NFKC_Casefolded}");
diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java
@@ -196,6 +196,7 @@ public Map<UcdProperty, Long> getCacheFileSize() {
     static final Transform<String, String> fromNumericPinyin =
             Transliterator.getInstance("NumericPinyin-Latin;nfc");
 
+    static final Merge<String> MULTIVALUED_JOINER = new PropertyUtilities.Joiner("|");
     static final Merge<String> ALPHABETIC_JOINER =
             new Merge<String>() {
                 TreeSet<String> sorted = new TreeSet<String>();
@@ -684,7 +685,7 @@ public VersionInfo getUcdVersion() {
     //        .get(toSkeleton(propertyAlias));
     //    }
 
-    class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
+    public class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
 
         private final UcdProperty prop;
         private final Map<String, PropertyNames> stringToNamedEnum;
@@ -724,6 +725,10 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
             }
         }
 
+        public IndexUnicodeProperties getFactory() {
+            return IndexUnicodeProperties.this; // the enclosing IndexUnicodeProperties instance
+        }
+
         @Override
         public boolean isTrivial() {
             return _getRawUnicodeMap().isEmpty()
diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java
@@ -884,7 +884,7 @@ private static void parsePropertyValueFile(
                                         && indexUnicodeProperties.ucdVersion.compareTo(
                                                         VersionInfo.UNICODE_4_0)
                                                 <= 0
-                                ? new PropertyUtilities.Joiner("|")
+                                ? IndexUnicodeProperties.MULTIVALUED_JOINER
                                 : null;
                 final var originalMultivaluedSplit = propInfo.multivaluedSplit;
                 // The first version of kPrimaryNumeric had spaces in values.
@@ -995,7 +995,7 @@ private static void parseNameAliasesFile(
                     indexUnicodeProperties,
                     nextProperties,
                     propInfoSet,
-                    IndexUnicodeProperties.ALPHABETIC_JOINER,
+                    IndexUnicodeProperties.MULTIVALUED_JOINER,
                     false);
         }
     }
diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java
@@ -448,7 +448,11 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
                             ? NULL_MATCHER
                             : new SimpleMatcher(
                                     propertyValue,
-                                    isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR),
+                                    getName().equals("Name") || getName().equals("Name_Alias")
+                                            ? CHARACTER_NAME_COMPARATOR
+                                            : isType(STRING_OR_MISC_MASK)
+                                                    ? null
+                                                    : PROPERTY_COMPARATOR),
                     result);
         }
     }
@@ -720,39 +724,83 @@ public static String toSkeleton(String source) {
         return skeletonBuffer.toString();
     }
 
-    /** Returns a representative of the equivalence class of source under UAX44-LM2. */
-    public static String toNameSkeleton(String source) {
+    public static final Comparator<String> CHARACTER_NAME_COMPARATOR = // for Name and Name_Alias
+            new Comparator<String>() {
+                @Override
+                public int compare(String o1, String o2) {
+                    return compareCharacterNames(o1, o2); // null-safe; see compareCharacterNames
+                }
+            };
+
+    public static int compareCharacterNames(String a, String b) { // compares LM2 skeletons
+        if (a == b) return 0; // same reference, including both null
+        if (a == null) return -1; // null sorts before any non-null name
+        if (b == null) return 1;
+        return toNameSkeleton(a, false).compareTo(toNameSkeleton(b, false)); // non-validating
+    }
+
+    /**
+     * Returns a representative of source's equivalence class under UAX44-LM2 (null for null). With
+     * validate=true, lowercase or otherwise illegal characters throw IllegalArgumentException.
+     */
+    public static String toNameSkeleton(String source, boolean validate) {
         if (source == null) return null;
-        StringBuffer result = new StringBuffer();
+        StringBuilder result = new StringBuilder();
         // remove spaces, medial '-'
         // we can do this with char, since no surrogates are involved
         for (int i = 0; i < source.length(); ++i) {
             char ch = source.charAt(i);
+            final char uppercase = Character.toUpperCase(ch);
+            if (validate && uppercase != ch) { // validating mode rejects lowercase input
+                throw new IllegalArgumentException(
+                        "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
+            }
+            ch = uppercase; // from here on, matching is case-insensitive
             if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') {
                 result.append(ch);
             } else if (ch == ' ') {
                 // don't copy ever
             } else if (ch == '-') {
-                // only copy non-medials AND trailing O-E
-                if (0 == i
-                        || i == source.length() - 1
-                        || source.charAt(i - 1) == ' '
-                        || source.charAt(i + 1) == ' '
-                        || (i == source.length() - 2
-                                && source.charAt(i - 1) == 'O'
-                                && source.charAt(i + 1) == 'E')) {
-                    System.out.println("****** EXCEPTION " + source);
+                // Only copy a hyphen-minus if it is non-medial, or if it is
+                // the hyphen in U+1180 HANGUL JUNGSEONG O-E.
+                boolean medial;
+                if (0 == i || i == source.length() - 1) {
+                    medial = false; // Name-initial or name-final.
+                } else {
+                    medial = // medial means a letter or digit on both sides
+                            Character.isLetterOrDigit(source.charAt(i - 1))
+                                    && Character.isLetterOrDigit(source.charAt(i + 1));
+                }
+                boolean is1180 = false;
+                if (medial
+                        && i <= source.length() - 2 // always true once medial is true
+                        && Character.toUpperCase(source.charAt(i + 1)) == 'E'
+                        && result.toString().equals("HANGULJUNGSEONGO")) {
+                    is1180 = true;
+                    for (int j = i + 2; j < source.length(); ++j) { // only ' ' or '_' may follow E
+                        if (source.charAt(j) != ' ' && source.charAt(j) != '_') {
+                            is1180 = false;
+                        }
+                    }
+                }
+                if (!medial || is1180) {
                     result.append(ch);
                 }
                 // otherwise don't copy
-            } else {
+            } else if (validate) {
                 throw new IllegalArgumentException(
                         "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch);
+            } else if (ch != '_') { // non-validating: drop '_', keep any other character
+                result.append(ch);
             }
         }
         return result.toString();
     }
 
+    public static String toNameSkeleton(String source) {
+        return toNameSkeleton(source, true); // validating: may throw IllegalArgumentException
+    }
+
     /**
      * These routines use the Java functions, because they only need to act on ASCII Changes space,
      * - into _, inserts _ between lower and UPPER.