|
13 | 13 | import com.google.common.collect.ImmutableMap; |
14 | 14 | import com.google.common.collect.ImmutableSet; |
15 | 15 | import com.ibm.icu.text.UTF16; |
| 16 | +import com.ibm.icu.text.UnicodeSet; |
16 | 17 | import java.io.IOException; |
17 | 18 | import java.io.PrintWriter; |
18 | 19 | import java.util.ArrayList; |
|
23 | 24 | import java.util.TreeMap; |
24 | 25 | import java.util.TreeSet; |
25 | 26 | import java.util.function.Consumer; |
| 27 | +import org.unicode.props.IndexUnicodeProperties; |
| 28 | +import org.unicode.props.UcdProperty; |
26 | 29 | import org.unicode.text.utility.Settings; |
27 | 30 | import org.unicode.text.utility.UTF32; |
28 | 31 | import org.unicode.text.utility.UnicodeDataFile; |
@@ -811,6 +814,54 @@ public static void writeNormalizerTestSuite(String directory, String fileName) |
811 | 814 | for (final String testSuiteCase : testSuiteCases) { |
812 | 815 | writeLine(testSuiteCase, log, false); |
813 | 816 | } |
| 817 | + // At least one implementation (ICU4X) has an edge case when a character |
| 818 | + // whose decomposition contains multiple starters and ends with a |
| 819 | + // non-starter is followed by a non-starter of lower CCC. |
| 820 | + // See https://github.com/unicode-org/unicodetools/issues/656 |
| 821 | + // and https://github.com/unicode-org/icu4x/pull/4530. |
| 822 | + // That implementation also has separate code paths for the BMP and |
| 823 | + // higher planes. No such decompositions currently exist outside the |
| 824 | + // BMP, but by generating these test cases we ensure that this would be |
| 825 | + // covered. |
| 826 | + // We stick them in Part 0, which is in principle for handcrafted test |
| 827 | + // cases, because there are not many of them, and the edge case feels a |
| 828 | + // tad too weird to describe in the title of a new part. |
| 829 | + final org.unicode.props.UnicodeProperty sc = |
| 830 | + IndexUnicodeProperties.make().getProperty(UcdProperty.Script); |
| 831 | + for (final String cp : UnicodeSet.ALL_CODE_POINTS) { |
| 832 | + final String[] decompositions = |
| 833 | + new String[] {Default.nfd().normalize(cp), Default.nfkd().normalize(cp)}; |
| 834 | + for (final String decomposition : decompositions) { |
| 835 | + final int lastCCC = |
| 836 | + Default.ucd() |
| 837 | + .getCombiningClass( |
| 838 | + decomposition.codePointBefore(decomposition.length())); |
| 839 | + final long starterCount = |
| 840 | + decomposition |
| 841 | + .codePoints() |
| 842 | + .filter(c -> (Default.ucd().getCombiningClass(c) == 0)) // CCC 0 = starter |
| 843 | + .count(); |
| 844 | + final String script = sc.getValue(cp.codePointAt(0)); |
| 845 | + if (lastCCC > 1 && starterCount > 1) { // need room for a follower with 0 < CCC < lastCCC |
| 846 | + // Try to pick a trailing nonstarter that might have a |
| 847 | + // chance of combining with the character if possible, |
| 848 | + // both for æsthetic reasons and to reproduce the example |
| 849 | + // ICU4X came across. If all else fails, use a character |
| 850 | + // with CCC=1, as low as it gets. |
| 851 | + if (script.equals("Arabic") && lastCCC > 220) { |
| 852 | + // ARABIC SUBSCRIPT ALEF. |
| 853 | + writeLine(cp + "\u0656", log, false); |
| 854 | + } else if (lastCCC > 220) { |
| 855 | + // COMBINING DOT BELOW. |
| 856 | + writeLine(cp + "\u0323", log, false); |
| 857 | + } else { |
| 858 | + // COMBINING TILDE OVERLAY. |
| 859 | + writeLine(cp + "\u0334", log, false); |
| 860 | + } |
| 861 | + break; |
| 862 | + } |
| 863 | + } |
| 864 | + } |
814 | 865 |
|
815 | 866 | System.out.println("Writing Part 2"); |
816 | 867 |
|
@@ -1318,13 +1369,6 @@ static final String comma(String s) { |
1318 | 1369 | "\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD", |
1319 | 1370 | "\u1100\uAC00\u11A8", |
1320 | 1371 | "\u1100\uAC00\u11A8\u11A8", |
1321 | | - // Some implementations have an edge case when a character whose |
1322 | | - // decomposition contains multiple starters and ends with a non-starter |
1323 | | - // is followed by a non-starter of lower CCC. |
1324 | | - // See https://github.com/unicode-org/unicodetools/issues/656 |
1325 | | - // and https://github.com/unicode-org/icu4x/pull/4530. |
1326 | | - "\u01C4\u0323", |
1327 | | - "\u0DDD\u0334", |
1328 | 1372 | }; |
1329 | 1373 | /* |
1330 | 1374 | static final void backwardsCompat(String directory, String filename, int[] list) throws IOException { |
|
0 commit comments