@@ -655,6 +655,20 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
655
655
return ;
656
656
}
657
657
}
658
+ int32_t sCount = s.countChar32 ();
659
+ UChar32 sUtf32 [32 ];
660
+ int32_t sLen = s.toUTF32 (sUtf32 , 32 , errorCode);
661
+ if (sLen != sCount ) {
662
+ // If this error is ever reached, just increase the buffer
663
+ // size above.
664
+ errorCode = U_UNSUPPORTED_ERROR;
665
+ return ;
666
+ }
667
+ for (int32_t i = 1 ; i < sLen - 1 ; ++i) {
668
+ if (u_getCombiningClass (sUtf32 [i]) == 0 ) {
669
+ contractionMiddleStarter.add (sUtf32 [i]);
670
+ }
671
+ }
658
672
}
659
673
}
660
674
@@ -697,6 +711,20 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
697
711
UnicodeString suffix (s, cLength);
698
712
UnicodeString context (static_cast <char16_t >(prefix.length ()));
699
713
context.append (prefix).append (suffix);
714
+ if (icu4xMode && !suffix.isEmpty () && !prefix.isEmpty ()) {
715
+ // ICU4X does not support the combination of prefix and contraction.
716
+ // This combination is supported by LDML but does not occur in the
717
+ // root or any tailorings in CLDR as of February 2025.
718
+ // If support for this case becomes necessary, a practical change
719
+ // would be allocating a flag on prefix ce32 and setting the
720
+ // flag on a prefix ce32 if any ce32 that can be found under
721
+ // the prefix ce32 (either the default or any UCharsTrie value) is
722
+ // a contraction ce32 or if the prefix ce32 is the utrie2 value
723
+ // for a character that is a starter that occurs in a middle
724
+ // (neither first nor last) position in a contraction.
725
+ errorCode = U_UNSUPPORTED_ERROR;
726
+ return ;
727
+ }
700
728
unsafeBackwardSet.addAll (suffix);
701
729
for (;;) {
702
730
// invariant: context > cond->context
@@ -1391,7 +1419,69 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
1391
1419
setDigitTags (errorCode);
1392
1420
setLeadSurrogates (errorCode);
1393
1421
1394
- if (!icu4xMode) {
1422
+ if (icu4xMode) {
1423
+ // Make sure that starters that occur in the middle of a
1424
+ // contraction have contraction ce32 with the
1425
+ // `CONTRACT_HAS_STARTER` flag set so that starters that
1426
+ // can occur in a non-final position in a contraction can
1427
+ // be easily recognized from having a contraction ce32
1428
+ // that has the `CONTRACT_HAS_STARTER` flag set.
1429
+
1430
+ UCharsTrieBuilder contractionBuilder (errorCode);
1431
+ // Intentionally unpaired low surrogate to make it never
1432
+ // match well-formed UTF-16 which ICU4X feeds to the
1433
+ // matcher.
1434
+ UnicodeString placeholder (0xDC00 );
1435
+
1436
+ for (UChar32 c : contractionMiddleStarter.codePoints ()) {
1437
+ uint32_t ce32 = utrie2_get32 (trie, c);
1438
+ UBool fromBase = false ;
1439
+ if (ce32 == Collation::FALLBACK_CE32) {
1440
+ fromBase = true ;
1441
+ ce32 = base->getCE32 (c);
1442
+ }
1443
+ if (!(Collation::hasCE32Tag (ce32, Collation::CONTRACTION_TAG) && (ce32 & Collation::CONTRACT_HAS_STARTER))) {
1444
+ if (fromBase) {
1445
+ // This case does not actually happen as of February 2025.
1446
+ ce32 = copyFromBaseCE32 (c, ce32, true , errorCode);
1447
+ }
1448
+ if (Collation::hasCE32Tag (ce32, Collation::CONTRACTION_TAG)) {
1449
+ // This middle starter is also the first character of another
1450
+ // contraction, but that contraction does not have the
1451
+ // CONTRACT_HAS_STARTER flag. Let's add the flag to
1452
+ // mark this at the expense of pessimizing the matching
1453
+ // of this contraction.
1454
+ // As of February 2025, this case does not actually occur
1455
+ // in CLDR.
1456
+ ce32 |= Collation::CONTRACT_HAS_STARTER;
1457
+ } else {
1458
+ // This middle starter is not also the first character
1459
+ // in another contraction.
1460
+
1461
+ // The UCharsTrie needs to contain some placeholder
1462
+ // because it cannot be empty. We build a trie
1463
+ // that never actually matches anything that ICU4X can try to
1464
+ // match, since ICU4X always passes well-formed UTF-16 to the
1465
+ // matcher and we put an unpaired low surrogate into the trie.
1466
+ // This pessimizes the character-to-CE mapping of `c`,
1467
+ // since useless trie matching will be attempted but as of
1468
+ // February 2025, only two relatively rare characters are affected.
1469
+ contractionBuilder.clear ();
1470
+ contractionBuilder.add (placeholder, static_cast <int32_t >(ce32), errorCode);
1471
+
1472
+ int32_t index = addContextTrie (ce32, contractionBuilder, errorCode);
1473
+ if (U_FAILURE (errorCode)) { return ; }
1474
+ if (index > Collation::MAX_INDEX) {
1475
+ errorCode = U_BUFFER_OVERFLOW_ERROR;
1476
+ return ;
1477
+ }
1478
+ // Set CONTRACT_HAS_STARTER to make identical prefix matching able to catch this.
1479
+ ce32 = Collation::makeCE32FromTagAndIndex (Collation::CONTRACTION_TAG, index) | Collation::CONTRACT_HAS_STARTER;
1480
+ }
1481
+ utrie2_set32 (trie, c, ce32, &errorCode);
1482
+ }
1483
+ }
1484
+ } else {
1395
1485
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1396
1486
ce32s.setElementAt (static_cast <int32_t >(utrie2_get32 (trie, 0 )), 0 );
1397
1487
utrie2_set32 (trie, 0 , Collation::makeCE32FromTagAndIndex (Collation::U0000_TAG, 0 ), &errorCode);
0 commit comments