Skip to content

Commit f305549

Browse files
hsivonen authored and markusicu committed
ICU-23053 In the ICU4X mode of the collation builder, mark middle starters in contractions
1 parent b9b952c commit f305549

File tree

4 files changed

+97
-1
lines changed

4 files changed

+97
-1
lines changed
28 Bytes
Binary file not shown.
28 Bytes
Binary file not shown.

icu4c/source/i18n/collationdatabuilder.cpp

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,20 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
655655
return;
656656
}
657657
}
658+
int32_t sCount = s.countChar32();
659+
UChar32 sUtf32[32];
660+
int32_t sLen = s.toUTF32(sUtf32, 32, errorCode);
661+
if (sLen != sCount) {
662+
// If this error is ever reached, just increase the buffer
663+
// size above.
664+
errorCode = U_UNSUPPORTED_ERROR;
665+
return;
666+
}
667+
for (int32_t i = 1; i < sLen - 1; ++i) {
668+
if (u_getCombiningClass(sUtf32[i]) == 0) {
669+
contractionMiddleStarter.add(sUtf32[i]);
670+
}
671+
}
658672
}
659673
}
660674

@@ -697,6 +711,20 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
697711
UnicodeString suffix(s, cLength);
698712
UnicodeString context(static_cast<char16_t>(prefix.length()));
699713
context.append(prefix).append(suffix);
714+
if (icu4xMode && !suffix.isEmpty() && !prefix.isEmpty()) {
715+
// ICU4X does not support the combination of prefix and contraction.
716+
// This combination is supported by LDML but does not occur in the
717+
// root or any tailorings in CLDR as of February 2025.
718+
// If support for this case becomes necessary, a practical change
719+
// would be allocating a flag on prefix ce32 and setting the
720+
// flag on a prefix ce32 if any ce32 that can be found under
721+
// the prefix ce32 (either the default or any UCharsTrie value) is
722+
// a contraction ce32 or if the prefix ce32 is the utrie2 value
723+
// for a character that is a starter that occurs in a middle
724+
// (neither first nor last) position in a contraction.
725+
errorCode = U_UNSUPPORTED_ERROR;
726+
return;
727+
}
700728
unsafeBackwardSet.addAll(suffix);
701729
for(;;) {
702730
// invariant: context > cond->context
@@ -1391,7 +1419,69 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
13911419
setDigitTags(errorCode);
13921420
setLeadSurrogates(errorCode);
13931421

1394-
if (!icu4xMode) {
1422+
if (icu4xMode) {
1423+
// Make sure that starters that occur in the middle of a
1424+
// contraction have contraction ce32 with the
1425+
// `CONTRACT_HAS_STARTER` flag set so that starters that
1426+
// can occur in a non-final position in a contraction can
1427+
// be easily recognized from having a contraction ce32
1428+
// that has the `CONTRACT_HAS_STARTER` flag set.
1429+
1430+
UCharsTrieBuilder contractionBuilder(errorCode);
1431+
// Intentionally unpaired low surrogate to make it never
1432+
// match well-formed UTF-16 which ICU4X feeds to the
1433+
// matcher.
1434+
UnicodeString placeholder(0xDC00);
1435+
1436+
for (UChar32 c : contractionMiddleStarter.codePoints()) {
1437+
uint32_t ce32 = utrie2_get32(trie, c);
1438+
UBool fromBase = false;
1439+
if(ce32 == Collation::FALLBACK_CE32) {
1440+
fromBase = true;
1441+
ce32 = base->getCE32(c);
1442+
}
1443+
if (!(Collation::hasCE32Tag(ce32, Collation::CONTRACTION_TAG) && (ce32 & Collation::CONTRACT_HAS_STARTER))) {
1444+
if (fromBase) {
1445+
// This case does not actually happen as of February 2025.
1446+
ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
1447+
}
1448+
if (Collation::hasCE32Tag(ce32, Collation::CONTRACTION_TAG)) {
1449+
// This middle starter is also the first character of another
1450+
// contraction, but that contraction does not have the
1451+
// CONTRACT_HAS_STARTER flag. Let's add the flag to
1452+
// mark this at the expense of pessimizing the matching
1453+
// of this contraction.
1454+
// As of February 2025, this case does not actually occur
1455+
// in CLDR.
1456+
ce32 |= Collation::CONTRACT_HAS_STARTER;
1457+
} else {
1458+
// This middle starter is not also the first character
1459+
// in another contraction.
1460+
1461+
// The UCharsTrie needs to contain some placeholder
1462+
// because it cannot be empty. We build a trie
1463+
// that never actually matches anything that ICU4X can try to
1464+
// match, since ICU4X always passes well-formed UTF-16 to the
1465+
// matcher and we put an unpaired low surrogate into the trie.
1466+
// This pessimizes the character to CE mapping of the `c`,
1467+
// since useless trie matching will be attempted but as of
1468+
// February 2025, only two relatively rare characters are affected.
1469+
contractionBuilder.clear();
1470+
contractionBuilder.add(placeholder, static_cast<int32_t>(ce32), errorCode);
1471+
1472+
int32_t index = addContextTrie(ce32, contractionBuilder, errorCode);
1473+
if(U_FAILURE(errorCode)) { return; }
1474+
if(index > Collation::MAX_INDEX) {
1475+
errorCode = U_BUFFER_OVERFLOW_ERROR;
1476+
return;
1477+
}
1478+
// Set CONTRACT_HAS_STARTER to make identical prefix matching able to catch this.
1479+
ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | Collation::CONTRACT_HAS_STARTER;
1480+
}
1481+
utrie2_set32(trie, c, ce32, &errorCode);
1482+
}
1483+
}
1484+
} else {
13951485
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
13961486
ce32s.setElementAt(static_cast<int32_t>(utrie2_get32(trie, 0)), 0);
13971487
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);

icu4c/source/i18n/collationdatabuilder.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,12 @@ class U_I18N_API CollationDataBuilder : public UObject {
254254
int32_t contextsEra = 0;
255255
protected:
256256
UnicodeSet unsafeBackwardSet;
257+
/**
258+
* For ICU4X only: The starters that occur in some contraction
259+
* in a position that is neither the first nor the last code point
260+
* of the contraction.
261+
*/
262+
UnicodeSet contractionMiddleStarter;
257263
UBool modified;
258264
UBool icu4xMode;
259265

0 commit comments

Comments
 (0)