@@ -53,6 +53,8 @@ public final class RadicalStroke {
5353 /** Radical strings. Avoid constructing them over and over. */
5454 private String [] radicalStrings = new String [(MAX_RADICAL_NUMBER + 1 ) << SIMPLIFIED_NUM_BITS ];
5555
56+ private final UnicodeSet simplifiedRadicals = new UnicodeSet ();
57+
5658 /**
5759 * Han characters for which code point order == radical-stroke order. Hand-picked exceptions
5860 * that are hard to detect optimally (because there are 2 or 3 in a row out of order) are
@@ -172,17 +174,46 @@ public RadicalStroke(String unicodeVersion) {
172174 System .out .println ("hanInCPOrder = " + hanInCPOrder .toPattern (false ));
173175 int numOutOfOrder = LAST_UNIHAN_11 + 1 - 0x4e00 - hanInCPOrder .size ();
174176 System .out .println ("number of original-Unihan characters out of order: " + numOutOfOrder );
175- // In Unicode 7.0, there are 313 original-Unihan characters out of order.
176- // If this number is much higher, then either the data has changed a lot,
177+ // In CLDR 26..45, we generated a radical-stroke order where
178+ // we treated traditional vs. simplified forms of radicals as different radicals.
179+ // Effectively, the simplified-ness of a radical had more weight than
180+ // the number of residual strokes.
181+ // This apparently matched the allocation of the original Unihan characters.
182+ // Sample Unicode 16 kRSUnicode data, where
183+ // the characters with radical 120 as a group precede the ones with 120':
184+ // 7E97 120.18
185+ // 7E98..7E9B 120.19
186+ // 7E9C..7E9D 120.21
187+ // 7E9E 120.23
188+ // 7E9F 120'.0
189+ // 7EA0 120'.2
190+ // 7EA1..7EAB 120'.3
191+ // 7EAC..7EB5 120'.4
192+ //
193+ // Based on that, in Unicode 7.0, there were 313 original-Unihan characters out of order.
194+ // The following assertion was put here as a trip wire, testing numOutOfOrder <= 320.
195+ //
196+ // In CLDR 46 (2024), we changed the sort order to conform to UAX #38,
197+ // demoting the simplified-ness of radicals to below the number of residual strokes.
198+ // For example, looking at the small sample above,
199+ // characters 7E9F..7EB5 sort before 7E97 because they have fewer residual strokes.
200+ // As a result, the improved radical-stroke order matches the
201+ // original-Unihan code point order less well, and
202+ // we got 1446 of these characters sorting differently.
203+ //
204+ // If this number is suddenly much higher, then either the data has changed a lot,
177205 // or there is a bug in the code.
178206 // Turn on the DEBUG flag and see if we can manually remove some characters from the set
179207 // so that a sequence of following ones does not get removed.
180- // TODO: Before changing the sort order to conform to UAX #38, demoting the simplified-ness
181- // of radicals to below the number of residual strokes,
182- // this successfully asserted numOutOfOrder <= 320.
183- // Find out if this is a known issue.
184208 assert numOutOfOrder <= 1500 ;
185- hanNotInCPOrder = new UnicodeSet (hanSet ).removeAll (hanInCPOrder ).freeze ();
209+ // Add simplifiedRadicals to hanNotInCPOrder so that WriteConformanceTest omits those.
210+ // The test data should work with both implicit-han and radical-stroke orders.
211+ // CLDR 46 changes radical-stroke order to match UAX #38,
212+ // which intermingles characters with traditional and simplified radicals,
213+ // different from CLDR 26..45 where
214+ // simplified radicals strongly sorted after traditional ones.
215+ hanNotInCPOrder =
216+ new UnicodeSet (hanSet ).removeAll (hanInCPOrder ).addAll (simplifiedRadicals ).freeze ();
186217 }
187218
188219 // Triples of (start, end, extension) for coalesced UAX #38 order blocks.
@@ -487,6 +518,9 @@ private void getCJKRadicals(IndexUnicodeProperties iup) {
487518 int radicalChar = Integer .parseInt (parts [1 ], 16 );
488519 assert 0 < radicalChar ;
489520 assert radicalChar < 0x3000 ; // should be a radical code point
521+ if ((radicalNumberAndSimplified & 3 ) != 0 ) {
522+ simplifiedRadicals .add (radicalChar );
523+ }
490524 radToChar [radicalNumberAndSimplified ] =
491525 radicalCharString = Character .toString ((char ) radicalChar );
492526 // radToChar[] remains null if there is no radical character.
0 commit comments