Skip to content

Commit 5471274

Browse files
authored
CLDR CollationTest: omit simplified radicals (#914)
* CLDR CollationTest: omit simplified radicals
* explain more original-Unihan characters out of order
1 parent ba82a85 commit 5471274

File tree

1 file changed

+41
-7
lines changed

1 file changed

+41
-7
lines changed

unicodetools/src/main/java/org/unicode/text/UCA/RadicalStroke.java

Lines changed: 41 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,8 @@ public final class RadicalStroke {
5353
/** Radical strings. Avoid constructing them over and over. */
5454
private String[] radicalStrings = new String[(MAX_RADICAL_NUMBER + 1) << SIMPLIFIED_NUM_BITS];
5555

56+
private final UnicodeSet simplifiedRadicals = new UnicodeSet();
57+
5658
/**
5759
* Han characters for which code point order == radical-stroke order. Hand-picked exceptions
5860
* that are hard to detect optimally (because there are 2 or 3 in a row out of order) are
@@ -172,17 +174,46 @@ public RadicalStroke(String unicodeVersion) {
172174
System.out.println("hanInCPOrder = " + hanInCPOrder.toPattern(false));
173175
int numOutOfOrder = LAST_UNIHAN_11 + 1 - 0x4e00 - hanInCPOrder.size();
174176
System.out.println("number of original-Unihan characters out of order: " + numOutOfOrder);
175-
// In Unicode 7.0, there are 313 original-Unihan characters out of order.
176-
// If this number is much higher, then either the data has changed a lot,
177+
// In CLDR 26..45, we generated a radical-stroke order where
178+
// we treated traditional vs. simplified forms of radicals as different radicals.
179+
// Effectively, the simplified-ness of a radical had more weight than
180+
// the number of residual strokes.
181+
// This apparently matched the allocation of the original Unihan characters.
182+
// Sample Unicode 16 kRSUnicode data, where
183+
// the characters with radical 120 as a group precede the ones with 120':
184+
// 7E97 120.18
185+
// 7E98..7E9B 120.19
186+
// 7E9C..7E9D 120.21
187+
// 7E9E 120.23
188+
// 7E9F 120'.0
189+
// 7EA0 120'.2
190+
// 7EA1..7EAB 120'.3
191+
// 7EAC..7EB5 120'.4
192+
//
193+
// Based on that, in Unicode 7.0, there were 313 original-Unihan characters out of order.
194+
// The following assertion was put here as a trip wire, testing numOutOfOrder <= 320.
195+
//
196+
// In CLDR 46 (2024), we changed the sort order to conform to UAX #38,
197+
// demoting the simplified-ness of radicals to below the number of residual strokes.
198+
// For example, looking at the small sample above,
199+
// characters 7E9F..7EB5 sort before 7E97 because they have fewer residual strokes.
200+
// As a result, the improved radical-stroke order matches the
201+
// original-Unihan code point order less well, and
202+
// we got 1446 of these characters sorting differently.
203+
//
204+
// If this number is suddenly much higher, then either the data has changed a lot,
177205
// or there is a bug in the code.
178206
// Turn on the DEBUG flag and see if we can manually remove some characters from the set
179207
// so that a sequence of following ones does not get removed.
180-
// TODO: Before changing the sort order to conform to UAX #38, demoting the simplified-ness
181-
// of radicals to below the number of residual strokes,
182-
// this successfully asserted numOutOfOrder <= 320.
183-
// Find out if this is a known issue.
184208
assert numOutOfOrder <= 1500;
185-
hanNotInCPOrder = new UnicodeSet(hanSet).removeAll(hanInCPOrder).freeze();
209+
// Exclude simplifiedRadicals so that WriteConformanceTest omits those.
210+
// The test data should work with both implicit-han and radical-stroke orders.
211+
// CLDR 46 changes radical-stroke order to match UAX #38,
212+
// which intermingles characters with traditional and simplified radicals,
213+
// different from CLDR 26..45 where
214+
// simplified radicals strongly sorted after traditional ones.
215+
hanNotInCPOrder =
216+
new UnicodeSet(hanSet).removeAll(hanInCPOrder).addAll(simplifiedRadicals).freeze();
186217
}
187218

188219
// Triples of (start, end, extension) for coalesced UAX #38 order blocks.
@@ -487,6 +518,9 @@ private void getCJKRadicals(IndexUnicodeProperties iup) {
487518
int radicalChar = Integer.parseInt(parts[1], 16);
488519
assert 0 < radicalChar;
489520
assert radicalChar < 0x3000; // should be a radical code point
521+
if ((radicalNumberAndSimplified & 3) != 0) {
522+
simplifiedRadicals.add(radicalChar);
523+
}
490524
radToChar[radicalNumberAndSimplified] =
491525
radicalCharString = Character.toString((char) radicalChar);
492526
// radToChar[] remains null if there is no radical character.

0 commit comments

Comments
 (0)