Ensure we have comprehensive coverage for the ICU4X bug (#669)

eggrobin · web-flow · commit 4e1e950b352f · 2024-01-23T01:54:43.000+01:00
diff --git a/unicodetools/data/ucd/dev/NormalizationTest.txt b/unicodetools/data/ucd/dev/NormalizationTest.txt
@@ -1,5 +1,5 @@
 # NormalizationTest-16.0.0.txt
-# Date: 2024-01-20, 01:49:31 GMT
+# Date: 2024-01-21, 18:36:20 GMT
 # © 2023 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -67,7 +67,25 @@
 1100 AC00 11A8;1100 AC01;1100 1100 1161 11A8;1100 AC01;1100 1100 1161 11A8; # (ᄀ각; ᄀ각; ᄀ각; ᄀ각; ᄀ각; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK
 1100 AC00 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8; # (ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK, HANGUL JONGSEONG KIYEOK
 01C4 0323;01C4 0323;01C4 0323;0044 1E92 030C;0044 005A 0323 030C; # (Ǆ◌̣; Ǆ◌̣; Ǆ◌̣; DẒ◌̌; DZ◌̣◌̌; ) LATIN CAPITAL LETTER DZ WITH CARON, COMBINING DOT BELOW
+01C5 0323;01C5 0323;01C5 0323;0044 1E93 030C;0044 007A 0323 030C; # (ǅ◌̣; ǅ◌̣; ǅ◌̣; Dẓ◌̌; Dz◌̣◌̌; ) LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON, COMBINING DOT BELOW
+01C6 0323;01C6 0323;01C6 0323;0064 1E93 030C;0064 007A 0323 030C; # (ǆ◌̣; ǆ◌̣; ǆ◌̣; dẓ◌̌; dz◌̣◌̌; ) LATIN SMALL LETTER DZ WITH CARON, COMBINING DOT BELOW
 0DDD 0334;0DDD 0334;0DD9 0DCF 0334 0DCA;0DDD 0334;0DD9 0DCF 0334 0DCA; # (ෝ◌̴; ෝ◌̴; ො◌̴◌්; ෝ◌̴; ො◌̴◌්; ) SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA, COMBINING TILDE OVERLAY
+3304 0334;3304 0334;3304 0334;30A4 30CB 30F3 30B0 0334;30A4 30CB 30F3 30AF 0334 3099; # (㌄◌̴; ㌄◌̴; ㌄◌̴; イニング◌̴; イニンク◌̴◌゙; ) SQUARE ININGU, COMBINING TILDE OVERLAY
+3307 0334;3307 0334;3307 0334;30A8 30B9 30AF 30FC 30C9 0334;30A8 30B9 30AF 30FC 30C8 0334 3099; # (㌇◌̴; ㌇◌̴; ㌇◌̴; エスクード◌̴; エスクート◌̴◌゙; ) SQUARE ESUKUUDO, COMBINING TILDE OVERLAY
+3310 0334;3310 0334;3310 0334;30AE 30AC 0334;30AD 3099 30AB 0334 3099; # (㌐◌̴; ㌐◌̴; ㌐◌̴; ギガ◌̴; キ◌゙カ◌̴◌゙; ) SQUARE GIGA, COMBINING TILDE OVERLAY
+331E 0334;331E 0334;331E 0334;30B3 30FC 30DD 0334;30B3 30FC 30DB 0334 309A; # (㌞◌̴; ㌞◌̴; ㌞◌̴; コーポ◌̴; コーホ◌̴◌゚; ) SQUARE KOOPO, COMBINING TILDE OVERLAY
+3321 0334;3321 0334;3321 0334;30B7 30EA 30F3 30B0 0334;30B7 30EA 30F3 30AF 0334 3099; # (㌡◌̴; ㌡◌̴; ㌡◌̴; シリング◌̴; シリンク◌̴◌゙; ) SQUARE SIRINGU, COMBINING TILDE OVERLAY
+3332 0334;3332 0334;3332 0334;30D5 30A1 30E9 30C3 30C9 0334;30D5 30A1 30E9 30C3 30C8 0334 3099; # (㌲◌̴; ㌲◌̴; ㌲◌̴; ファラッド◌̴; ファラット◌̴◌゙; ) SQUARE HUARADDO, COMBINING TILDE OVERLAY
+333B 0334;333B 0334;333B 0334;30DA 30FC 30B8 0334;30D8 309A 30FC 30B7 0334 3099; # (㌻◌̴; ㌻◌̴; ㌻◌̴; ページ◌̴; ヘ◌゚ーシ◌̴◌゙; ) SQUARE PEEZI, COMBINING TILDE OVERLAY
+3340 0334;3340 0334;3340 0334;30DD 30F3 30C9 0334;30DB 309A 30F3 30C8 0334 3099; # (㍀◌̴; ㍀◌̴; ㍀◌̴; ポンド◌̴; ホ◌゚ント◌̴◌゙; ) SQUARE PONDO, COMBINING TILDE OVERLAY
+334B 0334;334B 0334;334B 0334;30E1 30AC 0334;30E1 30AB 0334 3099; # (㍋◌̴; ㍋◌̴; ㍋◌̴; メガ◌̴; メカ◌̴◌゙; ) SQUARE MEGA, COMBINING TILDE OVERLAY
+334E 0334;334E 0334;334E 0334;30E4 30FC 30C9 0334;30E4 30FC 30C8 0334 3099; # (㍎◌̴; ㍎◌̴; ㍎◌̴; ヤード◌̴; ヤート◌̴◌゙; ) SQUARE YAADO, COMBINING TILDE OVERLAY
+FEF5 0656;FEF5 0656;FEF5 0656;0644 0622 0656;0644 0627 0656 0653; # (ﻵ◌ٖ; ﻵ◌ٖ; ﻵ◌ٖ; لآ◌ٖ; لا◌ٖ◌ٓ; ) ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE ISOLATED FORM, ARABIC SUBSCRIPT ALEF
+FEF6 0656;FEF6 0656;FEF6 0656;0644 0622 0656;0644 0627 0656 0653; # (ﻶ◌ٖ; ﻶ◌ٖ; ﻶ◌ٖ; لآ◌ٖ; لا◌ٖ◌ٓ; ) ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE FINAL FORM, ARABIC SUBSCRIPT ALEF
+FEF7 0656;FEF7 0656;FEF7 0656;0644 0623 0656;0644 0627 0656 0654; # (ﻷ◌ٖ; ﻷ◌ٖ; ﻷ◌ٖ; لأ◌ٖ; لا◌ٖ◌ٔ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE ISOLATED FORM, ARABIC SUBSCRIPT ALEF
+FEF8 0656;FEF8 0656;FEF8 0656;0644 0623 0656;0644 0627 0656 0654; # (ﻸ◌ٖ; ﻸ◌ٖ; ﻸ◌ٖ; لأ◌ٖ; لا◌ٖ◌ٔ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE FINAL FORM, ARABIC SUBSCRIPT ALEF
+FEF9 0334;FEF9 0334;FEF9 0334;0644 0625 0334;0644 0627 0334 0655; # (ﻹ◌̴; ﻹ◌̴; ﻹ◌̴; لإ◌̴; لا◌̴◌ٕ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW ISOLATED FORM, COMBINING TILDE OVERLAY
+FEFA 0334;FEFA 0334;FEFA 0334;0644 0625 0334;0644 0627 0334 0655; # (ﻺ◌̴; ﻺ◌̴; ﻺ◌̴; لإ◌̴; لا◌̴◌ٕ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW FINAL FORM, COMBINING TILDE OVERLAY
 #
 @Part1 # Character by character test
 # All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java
@@ -13,6 +13,7 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
@@ -23,6 +24,8 @@
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.function.Consumer;
+import org.unicode.props.IndexUnicodeProperties;
+import org.unicode.props.UcdProperty;
 import org.unicode.text.utility.Settings;
 import org.unicode.text.utility.UTF32;
 import org.unicode.text.utility.UnicodeDataFile;
@@ -811,6 +814,54 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
         for (final String testSuiteCase : testSuiteCases) {
             writeLine(testSuiteCase, log, false);
         }
+        // At least one implementation (ICU4X) has an edge case when a character
+        // whose decomposition contains multiple starters and ends with a
+        // non-starter is followed by a non-starter of lower CCC.
+        // See https://github.com/unicode-org/unicodetools/issues/656
+        // and https://github.com/unicode-org/icu4x/pull/4530.
+        // That implementation also has separate code paths for the BMP and
+        // higher planes.  No such decompositions currently exist outside the
+        // BMP, but by generating these test cases we ensure that this would be
+        // covered.
+        // We stick them in Part 0, which is in principle for handcrafted test
+        // cases, because there are not many of them, and the edge case feels a
+        // tad too weird to describe in the title of a new part.
+        final org.unicode.props.UnicodeProperty sc =
+                IndexUnicodeProperties.make().getProperty(UcdProperty.Script);
+        for (final String cp : UnicodeSet.ALL_CODE_POINTS) {
+            final String[] decompositions =
+                    new String[] {Default.nfd().normalize(cp), Default.nfkd().normalize(cp)};
+            for (final String decomposition : decompositions) {
+                final int lastCCC =
+                        Default.ucd()
+                                .getCombiningClass(
+                                        decomposition.codePointBefore(decomposition.length()));
+                // A combining class of 0 marks a starter, so this counts starters.
+                final long starterCount =
+                        decomposition
+                                .codePoints()
+                                .filter(c -> Default.ucd().getCombiningClass(c) == 0)
+                                .count();
+                if (lastCCC > 1 && starterCount > 1) {
+                    final String script = sc.getValue(cp.codePointAt(0));
+                    // Where possible, pick a trailing non-starter that plausibly combines
+                    // with the character, both for æsthetic reasons and to reproduce the
+                    // example ICU4X came across.  Otherwise fall back to a character with
+                    // CCC=1, as low as it gets.
+                    if ("Arabic".equals(script) && lastCCC > 220) {
+                        // ARABIC SUBSCRIPT ALEF.
+                        writeLine(cp + "\u0656", log, false);
+                    } else if (lastCCC > 220) {
+                        // COMBINING DOT BELOW.
+                        writeLine(cp + "\u0323", log, false);
+                    } else {
+                        // COMBINING TILDE OVERLAY.
+                        writeLine(cp + "\u0334", log, false);
+                    }
+                    break;
+                }
+            }
+        }
 
         System.out.println("Writing Part 2");
 
@@ -1318,13 +1369,6 @@ static final String comma(String s) {
         "\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD",
         "\u1100\uAC00\u11A8",
         "\u1100\uAC00\u11A8\u11A8",
-        // Some implementations have an edge case when a character whose
-        // decomposition contains multiple starters and ends with a non-starter
-        // is followed by a non-starter of lower CCC.
-        // See https://github.com/unicode-org/unicodetools/issues/656
-        // and https://github.com/unicode-org/icu4x/pull/4530.
-        "\u01C4\u0323",
-        "\u0DDD\u0334",
     };
     /*
     static final void backwardsCompat(String directory, String filename, int[] list) throws IOException {