Supply the field indices and correct the types of NormalizationCorrections data (#1087)

eggrobin · web-flow · commit 3743816e71bf · 2025-04-09T18:46:16.000+02:00
* Supply the field indices and correct the types of NormalizationCorrections data
* A test.
* Better test and comment
* @missing
* Minimal fix to emoji_variation_sequence getSet
* spots
diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java
@@ -732,8 +732,7 @@ public IndexUnicodeProperties getFactory() {
         @Override
         public boolean isTrivial() {
             return _getRawUnicodeMap().isEmpty()
-                    || ((_getRawUnicodeMap().stringKeys() == null
-                                    || _getRawUnicodeMap().stringKeys().isEmpty())
+                    || (!hasStrings()
                             && _getRawUnicodeMap()
                                     .keySet(_getRawUnicodeMap().getValue(0))
                                     .equals(UnicodeSet.ALL_CODE_POINTS));
@@ -798,6 +797,12 @@ protected UnicodeMap<String> _getRawUnicodeMap() {
             return load(prop);
         }
 
+        @Override
+        protected boolean hasStrings() {
+            return _getRawUnicodeMap().stringKeys() != null
+                    && !_getRawUnicodeMap().stringKeys().isEmpty();
+        }
+
         private UnicodeSet getDiffSet() {
             if (diffSet == null) {
                 diffSet =
@@ -836,10 +841,7 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) {
             }
             // We only do the delta thing for code points; for strings, we need to do the lookup
             // directly (and clean whatever was added by walking through history).
-            if (baseVersionProperties != null
-                    && (result.hasStrings()
-                            || (_getRawUnicodeMap().stringKeys() != null
-                                    && !_getRawUnicodeMap().stringKeys().isEmpty()))) {
+            if (baseVersionProperties != null && (result.hasStrings() || hasStrings())) {
                 result.removeAllStrings().addAll(super.getSet(matcher, new UnicodeSet()).strings());
             }
             return result;
diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java
@@ -94,6 +94,14 @@ public enum UcdProperty {
             null,
             ValueCardinality.Unordered,
             "cjkTraditionalVariant"),
+    normalization_correction_corrected(
+            PropertyType.String,
+            DerivedPropertyStatus.UCDNonProperty,
+            "normalization_correction_corrected"),
+    normalization_correction_original(
+            PropertyType.String,
+            DerivedPropertyStatus.UCDNonProperty,
+            "normalization_correction_original"),
 
     // Miscellaneous
     CJK_Radical(
@@ -497,14 +505,6 @@ public enum UcdProperty {
             null,
             ValueCardinality.Unordered,
             "cjkZhuangNumeric"),
-    normalization_correction_corrected(
-            PropertyType.Miscellaneous,
-            DerivedPropertyStatus.UCDNonProperty,
-            "normalization_correction_corrected"),
-    normalization_correction_original(
-            PropertyType.Miscellaneous,
-            DerivedPropertyStatus.UCDNonProperty,
-            "normalization_correction_original"),
     normalization_correction_version(
             PropertyType.Miscellaneous,
             DerivedPropertyStatus.UCDNonProperty,
diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java
@@ -470,10 +470,14 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
 
     public static final String UNUSED = "??";
 
+    protected boolean hasStrings() {
+        return false;
+    }
+
     public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) {
         if (result == null) result = new UnicodeSet();
         boolean uniformUnassigned = hasUniformUnassigned();
-        if (isType(STRING_OR_MISC_MASK) && !isMultivalued) {
+        if (isType(STRING_OR_MISC_MASK) && !isMultivalued && !hasStrings()) {
             for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned);
                     usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i
                 int i = usi.codepoint;
diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt
@@ -52,6 +52,9 @@ cjkTraditionalVariant ; kTraditionalVariant ; Provisional
 
 Do_Not_Emit_Preferred ; Do_Not_Emit_Preferred ; UCDNonProperty
 
+normalization_correction_original ; normalization_correction_original ; UCDNonProperty
+normalization_correction_corrected ; normalization_correction_corrected ; UCDNonProperty
+
 # ================================================
 # Miscellaneous Properties
 # ================================================
@@ -65,8 +68,6 @@ NS ; Named_Sequences ; UCDNonProperty
 NSP ; Named_Sequences_Prov ; UCDNonProperty
 SV ; Standardized_Variant ; UCDNonProperty
 
-normalization_correction_original ; normalization_correction_original ; UCDNonProperty
-normalization_correction_corrected ; normalization_correction_corrected ; UCDNonProperty
 normalization_correction_version ; normalization_correction_version ; UCDNonProperty
 
 emoji_variation_sequence ; emoji_variation_sequence ; UCDNonProperty
diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt
@@ -326,6 +326,9 @@ Do_Not_Emit_Type ; Discouraged              ; Discouraged
 Do_Not_Emit_Type ; Preferred_Spelling       ; Preferred_Spelling
 Do_Not_Emit_Type ; Arabic_Tashkil           ; Arabic_Tashkil
 
+# @missing: 0000..10FFFF; normalization_correction_original; <none>
+# @missing: 0000..10FFFF; normalization_correction_corrected; <none>
+
 # Values from the old Provisional Indic_Matra_Category (identified with the Informative InPC in the
 # tools) and InSC.
 Indic_Positional_Category ; Invisible ; Invisible
diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt
@@ -328,9 +328,9 @@ NushuSources ; kReading
 TangutSources ; kRSTUnicode
 TangutSources ; kTGT_MergedSrc
 
-NormalizationCorrections ; normalization_correction_original
-NormalizationCorrections ; normalization_correction_corrected
-NormalizationCorrections ; normalization_correction_version
+NormalizationCorrections ; normalization_correction_original ; 1
+NormalizationCorrections ; normalization_correction_corrected ; 2
+NormalizationCorrections ; normalization_correction_version ; 3
 
 # Properties removed from Unihan before 5.1.
 # Point to a nonexistent file so that we don’t try to read them from the most recent monolithic
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -616,6 +616,17 @@ $combiningExclusions ⊇ [$singletons & \p{dt=canonical}]
 $combiningExclusions ⊇ [$nonstarter & \p{dt=canonical}]
 $combiningExclusions ⊇ [$firstNonStarter & \p{dt=canonical}]
 
+
+\p{normalization_correction_version≠@none@} = [陋 㛼 当 𤎫 竮 䗗]
+# In the current version, every affected character has the corrected mapping.
+In \p{normalization_correction_version≠@none@}, normalization_correction_corrected = Decomposition_Mapping
+# This all happened between 3.1 and 4.0.
+In \p{normalization_correction_version≠@none@}, normalization_correction_corrected = U4.0:Decomposition_Mapping
+In \p{normalization_correction_version≠@none@}, normalization_correction_original  = U3.1:Decomposition_Mapping
+# The version field tells us whether the 3.2 mapping was wrong.
+In \p{normalization_correction_version=3.2.0}, U3.2:Decomposition_Mapping = normalization_correction_corrected
+In \p{normalization_correction_version=4.0.0}, U3.2:Decomposition_Mapping = normalization_correction_original
+
 ##########################
 # Other Invariant Tests, not in Stability Policies
 ##########################
@@ -1097,6 +1108,11 @@ Let $HairComponents := [\U0001F9B0-\U0001F9B3]
 # And nearly all emoji are Extended_Pictographic.
 [\p{Extended_Pictographic}-\p{gc=Cn}] = [\p{Emoji}-\p{Regional_Indicator}-\p{Emoji_Modifier}-\p{Block=Basic Latin}]
 
+# TODO(egg): We could check that the variation selectors are used consistently, but this would
+# require UnicodeProperty to support strings properly.
+\p{emoji_variation_sequence=text style} ⊃ [{♈︎}]
+\p{emoji_variation_sequence=emoji style} ⊃ [{♈️}]
+
 ##########################
 # POSIX Compatibility Properties (UTS#18)
 # http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html
@@ -1321,6 +1337,10 @@ $unikemetScope ⊃ \P{kEH_Core=None}
 \p{kEH_Core=Legacy} ⊂ \p{Block=Egyptian Hieroglyphs}
 [\p{kEH_Core=None} & $unikemetScope] ⊂ \p{Block=Egyptian Hieroglyphs Extended-A}
 
+# TODO(egg): We could check that the variation selectors are used consistently, but this would
+# require UnicodeProperty to support strings properly.
+\p{Standardized_Variant=rotated 90 degrees} ⊃ [{𓏲\uFE00}]
+
 # InPC-InSC-gc invariants
 # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.
 \p{InPC=/(Left|Right)/} ⊆ [\p{gc=Mc}\p{gc=Lo}\p{gc=Lm}]