Skip to content

Commit 4e1e950

Browse files
authored
Ensure we have comprehensive coverage for the ICU4X bug (#669)
1 parent 51c579b commit 4e1e950

File tree

2 files changed

+70
-8
lines changed

2 files changed

+70
-8
lines changed

unicodetools/data/ucd/dev/NormalizationTest.txt

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# NormalizationTest-16.0.0.txt
2-
# Date: 2024-01-20, 01:49:31 GMT
2+
# Date: 2024-01-21, 18:36:20 GMT
33
# © 2023 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -67,7 +67,25 @@
6767
1100 AC00 11A8;1100 AC01;1100 1100 1161 11A8;1100 AC01;1100 1100 1161 11A8; # (ᄀ각; ᄀ각; ᄀ각; ᄀ각; ᄀ각; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK
6868
1100 AC00 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8;1100 AC01 11A8;1100 1100 1161 11A8 11A8; # (ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ᄀ각ᆨ; ) HANGUL CHOSEONG KIYEOK, HANGUL SYLLABLE GA, HANGUL JONGSEONG KIYEOK, HANGUL JONGSEONG KIYEOK
6969
01C4 0323;01C4 0323;01C4 0323;0044 1E92 030C;0044 005A 0323 030C; # (DŽ◌̣; DŽ◌̣; DŽ◌̣; DẒ◌̌; DZ◌̣◌̌; ) LATIN CAPITAL LETTER DZ WITH CARON, COMBINING DOT BELOW
70+
01C5 0323;01C5 0323;01C5 0323;0044 1E93 030C;0044 007A 0323 030C; # (Dž◌̣; Dž◌̣; Dž◌̣; Dẓ◌̌; Dz◌̣◌̌; ) LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON, COMBINING DOT BELOW
71+
01C6 0323;01C6 0323;01C6 0323;0064 1E93 030C;0064 007A 0323 030C; # (dž◌̣; dž◌̣; dž◌̣; dẓ◌̌; dz◌̣◌̌; ) LATIN SMALL LETTER DZ WITH CARON, COMBINING DOT BELOW
7072
0DDD 0334;0DDD 0334;0DD9 0DCF 0334 0DCA;0DDD 0334;0DD9 0DCF 0334 0DCA; # (ෝ◌̴; ෝ◌̴; ො◌̴◌්; ෝ◌̴; ො◌̴◌්; ) SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA, COMBINING TILDE OVERLAY
73+
3304 0334;3304 0334;3304 0334;30A4 30CB 30F3 30B0 0334;30A4 30CB 30F3 30AF 0334 3099; # (㌄◌̴; ㌄◌̴; ㌄◌̴; イニング◌̴; イニンク◌̴◌゙; ) SQUARE ININGU, COMBINING TILDE OVERLAY
74+
3307 0334;3307 0334;3307 0334;30A8 30B9 30AF 30FC 30C9 0334;30A8 30B9 30AF 30FC 30C8 0334 3099; # (㌇◌̴; ㌇◌̴; ㌇◌̴; エスクード◌̴; エスクート◌̴◌゙; ) SQUARE ESUKUUDO, COMBINING TILDE OVERLAY
75+
3310 0334;3310 0334;3310 0334;30AE 30AC 0334;30AD 3099 30AB 0334 3099; # (㌐◌̴; ㌐◌̴; ㌐◌̴; ギガ◌̴; キ◌゙カ◌̴◌゙; ) SQUARE GIGA, COMBINING TILDE OVERLAY
76+
331E 0334;331E 0334;331E 0334;30B3 30FC 30DD 0334;30B3 30FC 30DB 0334 309A; # (㌞◌̴; ㌞◌̴; ㌞◌̴; コーポ◌̴; コーホ◌̴◌゚; ) SQUARE KOOPO, COMBINING TILDE OVERLAY
77+
3321 0334;3321 0334;3321 0334;30B7 30EA 30F3 30B0 0334;30B7 30EA 30F3 30AF 0334 3099; # (㌡◌̴; ㌡◌̴; ㌡◌̴; シリング◌̴; シリンク◌̴◌゙; ) SQUARE SIRINGU, COMBINING TILDE OVERLAY
78+
3332 0334;3332 0334;3332 0334;30D5 30A1 30E9 30C3 30C9 0334;30D5 30A1 30E9 30C3 30C8 0334 3099; # (㌲◌̴; ㌲◌̴; ㌲◌̴; ファラッド◌̴; ファラット◌̴◌゙; ) SQUARE HUARADDO, COMBINING TILDE OVERLAY
79+
333B 0334;333B 0334;333B 0334;30DA 30FC 30B8 0334;30D8 309A 30FC 30B7 0334 3099; # (㌻◌̴; ㌻◌̴; ㌻◌̴; ページ◌̴; ヘ◌゚ーシ◌̴◌゙; ) SQUARE PEEZI, COMBINING TILDE OVERLAY
80+
3340 0334;3340 0334;3340 0334;30DD 30F3 30C9 0334;30DB 309A 30F3 30C8 0334 3099; # (㍀◌̴; ㍀◌̴; ㍀◌̴; ポンド◌̴; ホ◌゚ント◌̴◌゙; ) SQUARE PONDO, COMBINING TILDE OVERLAY
81+
334B 0334;334B 0334;334B 0334;30E1 30AC 0334;30E1 30AB 0334 3099; # (㍋◌̴; ㍋◌̴; ㍋◌̴; メガ◌̴; メカ◌̴◌゙; ) SQUARE MEGA, COMBINING TILDE OVERLAY
82+
334E 0334;334E 0334;334E 0334;30E4 30FC 30C9 0334;30E4 30FC 30C8 0334 3099; # (㍎◌̴; ㍎◌̴; ㍎◌̴; ヤード◌̴; ヤート◌̴◌゙; ) SQUARE YAADO, COMBINING TILDE OVERLAY
83+
FEF5 0656;FEF5 0656;FEF5 0656;0644 0622 0656;0644 0627 0656 0653; # (ﻵ◌ٖ; ﻵ◌ٖ; ﻵ◌ٖ; لآ◌ٖ; لا◌ٖ◌ٓ; ) ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE ISOLATED FORM, ARABIC SUBSCRIPT ALEF
84+
FEF6 0656;FEF6 0656;FEF6 0656;0644 0622 0656;0644 0627 0656 0653; # (ﻶ◌ٖ; ﻶ◌ٖ; ﻶ◌ٖ; لآ◌ٖ; لا◌ٖ◌ٓ; ) ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE FINAL FORM, ARABIC SUBSCRIPT ALEF
85+
FEF7 0656;FEF7 0656;FEF7 0656;0644 0623 0656;0644 0627 0656 0654; # (ﻷ◌ٖ; ﻷ◌ٖ; ﻷ◌ٖ; لأ◌ٖ; لا◌ٖ◌ٔ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE ISOLATED FORM, ARABIC SUBSCRIPT ALEF
86+
FEF8 0656;FEF8 0656;FEF8 0656;0644 0623 0656;0644 0627 0656 0654; # (ﻸ◌ٖ; ﻸ◌ٖ; ﻸ◌ٖ; لأ◌ٖ; لا◌ٖ◌ٔ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE FINAL FORM, ARABIC SUBSCRIPT ALEF
87+
FEF9 0334;FEF9 0334;FEF9 0334;0644 0625 0334;0644 0627 0334 0655; # (ﻹ◌̴; ﻹ◌̴; ﻹ◌̴; لإ◌̴; لا◌̴◌ٕ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW ISOLATED FORM, COMBINING TILDE OVERLAY
88+
FEFA 0334;FEFA 0334;FEFA 0334;0644 0625 0334;0644 0627 0334 0655; # (ﻺ◌̴; ﻺ◌̴; ﻺ◌̴; لإ◌̴; لا◌̴◌ٕ; ) ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW FINAL FORM, COMBINING TILDE OVERLAY
7189
#
7290
@Part1 # Character by character test
7391
# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.

unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import com.google.common.collect.ImmutableMap;
1414
import com.google.common.collect.ImmutableSet;
1515
import com.ibm.icu.text.UTF16;
16+
import com.ibm.icu.text.UnicodeSet;
1617
import java.io.IOException;
1718
import java.io.PrintWriter;
1819
import java.util.ArrayList;
@@ -23,6 +24,8 @@
2324
import java.util.TreeMap;
2425
import java.util.TreeSet;
2526
import java.util.function.Consumer;
27+
import org.unicode.props.IndexUnicodeProperties;
28+
import org.unicode.props.UcdProperty;
2629
import org.unicode.text.utility.Settings;
2730
import org.unicode.text.utility.UTF32;
2831
import org.unicode.text.utility.UnicodeDataFile;
@@ -811,6 +814,54 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
811814
for (final String testSuiteCase : testSuiteCases) {
812815
writeLine(testSuiteCase, log, false);
813816
}
817+
// At least one implementation (ICU4X) has an edge case when a character
818+
// whose decomposition contains multiple starters and ends with a
819+
// non-starter is followed by a non-starter of lower CCC.
820+
// See https://github.com/unicode-org/unicodetools/issues/656
821+
// and https://github.com/unicode-org/icu4x/pull/4530.
822+
// That implementation also has separate code paths for the BMP and
823+
// higher planes. No such decompositions currently exist outside the
824+
// BMP, but by generating these test cases we ensure that this would be
825+
// covered.
826+
// We stick them in Part 0, which is in principle for handcrafted test
827+
// cases, because there are not many of them, and the edge case feels a
828+
// tad too weird to describe in the title of a new part.
829+
final org.unicode.props.UnicodeProperty sc =
830+
IndexUnicodeProperties.make().getProperty(UcdProperty.Script);
831+
for (final String cp : UnicodeSet.ALL_CODE_POINTS) {
832+
final String[] decompositions =
833+
new String[] {Default.nfd().normalize(cp), Default.nfkd().normalize(cp)};
834+
for (final String decomposition : decompositions) {
835+
final int lastCCC =
836+
Default.ucd()
837+
.getCombiningClass(
838+
decomposition.codePointBefore(decomposition.length()));
839+
final long nonStarterCount =
840+
decomposition
841+
.codePoints()
842+
.filter(c -> (Default.ucd().getCombiningClass(c) == 0))
843+
.count();
844+
final String script = sc.getValue(cp.codePointAt(0));
845+
if (lastCCC > 1 && nonStarterCount > 1) {
846+
// Try to pick a trailing non-starter that might have a
847+
// chance of combining with the character if possible,
848+
// both for æsthetic reasons and to reproduce the example
849+
// ICU4X came across. If all else fails, use a character
850+
// with CCC=1, as low as it gets.
851+
if (script.equals("Arabic") && lastCCC > 220) {
852+
// ARABIC SUBSCRIPT ALEF.
853+
writeLine(cp + "\u0656", log, false);
854+
} else if (lastCCC > 220) {
855+
// COMBINING DOT BELOW.
856+
writeLine(cp + "\u0323", log, false);
857+
} else {
858+
// COMBINING TILDE OVERLAY.
859+
writeLine(cp + "\u0334", log, false);
860+
}
861+
break;
862+
}
863+
}
864+
}
814865

815866
System.out.println("Writing Part 2");
816867

@@ -1318,13 +1369,6 @@ static final String comma(String s) {
13181369
"\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD",
13191370
"\u1100\uAC00\u11A8",
13201371
"\u1100\uAC00\u11A8\u11A8",
1321-
// Some implementations have an edge case when a character whose
1322-
// decomposition contains multiple starters and ends with a non-starter
1323-
// is followed by a non-starter of lower CCC.
1324-
// See https://github.com/unicode-org/unicodetools/issues/656
1325-
// and https://github.com/unicode-org/icu4x/pull/4530.
1326-
"\u01C4\u0323",
1327-
"\u0DDD\u0334",
13281372
};
13291373
/*
13301374
static final void backwardsCompat(String directory, String filename, int[] list) throws IOException {

0 commit comments

Comments
 (0)