Skip to content

Commit 0b9eb9c

Browse files
committed
ICU-22956 Use InCB for grapheme cluster segmentation
1 parent 700c5e3 commit 0b9eb9c

File tree

8 files changed

+60
-88
lines changed

8 files changed

+60
-88
lines changed

icu4c/source/data/brkitr/rules/char.txt

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
2424
$Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
2525
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
2626

27-
#
28-
# From cldr/common/properties/segments/
29-
# and issue CLDR-10994
30-
#
31-
$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
32-
$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
33-
$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];
27+
$InCBConsonant = [\p{InCB=Consonant}];
28+
$InCBExtend = [\p{InCB=Extend}];
29+
$InCBLinker = [\p{InCB=Linker}];
3430

3531
# Korean Syllable Definitions
3632
#
@@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT);
6460
# GB 9b
6561
$Prepend [^$Control $CR $LF];
6662

67-
# GB 9.3, from CLDR-10994
68-
$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;
63+
# GB 9c
64+
$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant;
6965

7066
# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
7167
$Extended_Pict $Extend* $ZWJ $Extended_Pict;

icu4c/source/test/intltest/rbbitst.cpp

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1655,9 +1655,9 @@ class RBBICharMonkey: public RBBIMonkeyKind {
16551655
UnicodeSet *fLVTSet;
16561656
UnicodeSet *fHangulSet;
16571657
UnicodeSet *fExtendedPictSet;
1658-
UnicodeSet *fViramaSet;
1659-
UnicodeSet *fLinkingConsonantSet;
1660-
UnicodeSet *fExtCccZwjSet;
1658+
UnicodeSet *fInCBLinkerSet;
1659+
UnicodeSet *fInCBConsonantSet;
1660+
UnicodeSet *fInCBExtendSet;
16611661
UnicodeSet *fAnySet;
16621662

16631663
const UnicodeString *fText;
@@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() {
16901690
fHangulSet->addAll(*fLVTSet);
16911691

16921692
fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1693-
fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1694-
"\\p{Indic_Syllabic_Category=Virama}]", status);
1695-
fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1696-
"\\p{Indic_Syllabic_Category=Consonant}]", status);
1697-
fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1693+
fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status);
1694+
fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status);
1695+
fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status);
16981696
fAnySet = new UnicodeSet(0, 0x10ffff);
16991697

17001698
// Create sets of characters, and add the names of the above character sets.
@@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() {
17131711
sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul");
17141712
sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ");
17151713
sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict");
1716-
sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama");
1717-
sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant");
1718-
sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj");
1714+
sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker");
1715+
sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant");
1716+
sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend");
17191717
sets.emplace_back(*fAnySet); classNames.emplace_back("Any");
17201718

17211719
if (U_FAILURE(status)) {
@@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
18381836
continue;
18391837
}
18401838

1841-
// Note: Viramas are also included in the ExtCccZwj class.
1842-
if (fLinkingConsonantSet->contains(c2)) {
1839+
if (fInCBConsonantSet->contains(c2)) {
18431840
int pi = p1;
18441841
bool sawVirama = false;
1845-
while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1846-
if (fViramaSet->contains(fText->char32At(pi))) {
1842+
while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) ||
1843+
fInCBLinkerSet->contains(fText->char32At(pi)))) {
1844+
if (fInCBLinkerSet->contains(fText->char32At(pi))) {
18471845
sawVirama = true;
18481846
}
18491847
pi = fText->moveIndex32(pi, -1);
18501848
}
1851-
if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1852-
setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
1853-
continue;
1849+
if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) {
1850+
setAppliedRule(
1851+
p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})");
1852+
continue;
18541853
}
18551854
}
18561855

@@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() {
19031902
delete fAnySet;
19041903
delete fZWJSet;
19051904
delete fExtendedPictSet;
1906-
delete fViramaSet;
1907-
delete fLinkingConsonantSet;
1908-
delete fExtCccZwjSet;
1905+
delete fInCBLinkerSet;
1906+
delete fInCBConsonantSet;
1907+
delete fInCBExtendSet;
19091908
}
19101909

19111910
//------------------------------------------------------------------------------------------

icu4c/source/test/testdata/break_rules/grapheme.txt

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
1818
LF = [\p{Grapheme_Cluster_Break = LF}];
1919

2020
Control = [[\p{Grapheme_Cluster_Break = Control}]];
21-
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
21+
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
2222
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
2323
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
2424
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
@@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
3838
Extended_Pict = [:ExtPict:];
3939

4040
# Indic Sequences
41-
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
42-
43-
LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
44-
45-
ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
41+
InCBLinker = [\p{InCB=Linker}];
42+
InCBConsonant = [\p{InCB=Consonant}];
43+
InCBExtend = [\p{InCB=Extend}];
4644

4745
GB3: CR LF;
4846
GB4: (Control | CR | LF) ÷;
@@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
5250
GB7: (LV | V) (V | T);
5351
GB8: (LVT | T) T;
5452

55-
GB11: Extended_Pict Extend* ZWJ Extended_Pict;
56-
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
57-
GB9: . (Extend | ZWJ);
53+
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
54+
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
55+
GB9: . (Extend_ | ZWJ);
5856

5957
GB9a: . SpacingMark;
6058
GB9b: Prepend .;

icu4c/source/test/testdata/rbbitst.txt

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -169,18 +169,9 @@
169169
#
170170
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
171171

172-
#
173-
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
174-
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
175-
# Sample Chars: LinkingConsonant: \u0915
176-
# Virama: \u094d [also Extend]
177-
# ExtCccZWJ: \u0308
178-
# Extend but not ExtCCCZWJ \u093A
179-
<char>
180-
<data>•\u0915\u094d\u0915•</data>
181-
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
182-
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
183-
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
172+
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
173+
# This test would have caught ICU-22956.
174+
<data>•સૻ્સૻ•</data>
184175

185176
#
186177
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
Binary file not shown.

icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
145145
UnicodeSet fHangulSet;
146146
UnicodeSet fZWJSet;
147147
UnicodeSet fExtendedPictSet;
148-
UnicodeSet fViramaSet;
149-
UnicodeSet fLinkingConsonantSet;
150-
UnicodeSet fExtCccZwjSet;
148+
UnicodeSet fInCBLinkerSet;
149+
UnicodeSet fInCBConsonantSet;
150+
UnicodeSet fInCBExtendSet;
151151
UnicodeSet fAnySet;
152152

153153

@@ -176,11 +176,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
176176
fHangulSet.addAll(fLVTSet);
177177

178178
fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
179-
fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
180-
+ "\\p{Indic_Syllabic_Category=Virama}]");
181-
fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
182-
+ "\\p{Indic_Syllabic_Category=Consonant}]");
183-
fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
179+
fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]");
180+
fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]");
181+
fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]");
184182
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
185183

186184

@@ -196,9 +194,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
196194
fSets.add(fAnySet); fClassNames.add("Any");
197195
fSets.add(fZWJSet); fClassNames.add("ZWJ");
198196
fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict");
199-
fSets.add(fViramaSet); fClassNames.add("Virama");
200-
fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant");
201-
fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj");
197+
fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker");
198+
fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant");
199+
fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend");
202200
}
203201

204202

@@ -315,17 +313,18 @@ int next(int prevPos) {
315313
}
316314

317315
// Note: the backward scan below steps over both InCB=Extend and InCB=Linker characters, recording whether a Linker was seen.
318-
if (fLinkingConsonantSet.contains(c2)) {
316+
if (fInCBConsonantSet.contains(c2)) {
319317
int pi = p1;
320318
boolean sawVirama = false;
321-
while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
322-
if (fViramaSet.contains(fText.codePointAt(pi))) {
319+
while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) ||
320+
fInCBLinkerSet.contains(fText.codePointAt(pi)))) {
321+
if (fInCBLinkerSet.contains(fText.codePointAt(pi))) {
323322
sawVirama = true;
324323
}
325324
pi = fText.offsetByCodePoints(pi, -1);
326325
}
327-
if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
328-
setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
326+
if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) {
327+
setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant})");
329328
continue;
330329
}
331330
}

icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
1818
LF = [\p{Grapheme_Cluster_Break = LF}];
1919

2020
Control = [[\p{Grapheme_Cluster_Break = Control}]];
21-
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
21+
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
2222
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
2323
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
2424
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
@@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
3838
Extended_Pict = [:ExtPict:];
3939

4040
# Indic Sequences
41-
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
42-
43-
LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
44-
45-
ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
41+
InCBLinker = [\p{InCB=Linker}];
42+
InCBConsonant = [\p{InCB=Consonant}];
43+
InCBExtend = [\p{InCB=Extend}];
4644

4745
GB3: CR LF;
4846
GB4: (Control | CR | LF) ÷;
@@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
5250
GB7: (LV | V) (V | T);
5351
GB8: (LVT | T) T;
5452

55-
GB11: Extended_Pict Extend* ZWJ Extended_Pict;
56-
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
57-
GB9: . (Extend | ZWJ);
53+
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
54+
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
55+
GB9: . (Extend_ | ZWJ);
5856

5957
GB9a: . SpacingMark;
6058
GB9b: Prepend .;

icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -169,18 +169,9 @@
169169
#
170170
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
171171

172-
#
173-
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
174-
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
175-
# Sample Chars: LinkingConsonant: \u0915
176-
# Virama: \u094d [also Extend]
177-
# ExtCccZWJ: \u0308
178-
# Extend but not ExtCCCZWJ \u093A
179-
<char>
180-
<data>•\u0915\u094d\u0915•</data>
181-
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
182-
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
183-
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
172+
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
173+
# This test would have caught ICU-22956.
174+
<data>•સૻ્સૻ•</data>
184175

185176
#
186177
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt

0 commit comments

Comments
 (0)