Skip to content

Commit d7da03d

Browse files
committed
inline boundary checking
1 parent e50d821 commit d7da03d

File tree

1 file changed

+82
-110
lines changed

1 file changed

+82
-110
lines changed

src/grapheme.js

Lines changed: 82 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -59,52 +59,94 @@ export function* graphemeSegments(input) {
5959
/** Category of codepoint immediately preceding cursor */
6060
let catBefore = cat(cp);
6161

62-
/** @type {GraphemeCategoryNum | null} Category of codepoint immediately preceding cursor. */
63-
let catAfter = null;
62+
/** @type {GraphemeCategoryNum} Category of codepoint immediately following cursor. */
63+
let catAfter = 0;
6464

6565
/** The number of RIS codepoints preceding `cursor`. */
6666
let risCount = 0;
6767

68-
/** Emoji state */
68+
/**
69+
* Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
70+
* Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
71+
*/
6972
let emoji = false;
7073

71-
/** InCB=Consonant */
74+
/** InCB=Consonant - segment started with Indic consonant */
7275
let consonant = false;
7376

74-
/** InCB=Linker */
77+
/** InCB=Linker - seen a linker after consonant */
7578
let linker = false;
7679

77-
/** InCB=Consonant InCB=Linker x InCB=Consonant */
78-
let incb = false;
79-
8080
let index = 0;
8181

8282
/** Beginning category of a segment */
8383
let _catBegin = catBefore;
8484

85-
/** Memoize the beginnig code point of the segment. */
85+
/** Memoize the beginning code point of the segment. */
8686
let _hd = cp;
8787

8888
while (cursor < len) {
8989
cp = /** @type {number} */ (input.codePointAt(cursor));
9090
catAfter = cat(cp);
9191

92-
if (catBefore === 10 /* Regional_Indicator */) {
93-
risCount++;
94-
} else {
95-
risCount = 0;
96-
if (
97-
catAfter === 14 /* ZWJ */
98-
&& (catBefore === 3 /* Extend */ || catBefore === 4 /* Extended_Pictographic */)
99-
) {
100-
emoji = true;
92+
let boundary = true;
10193

102-
} else if (catAfter === 0) {
103-
incb = consonant && linker && isIndicConjunctConsonant(cp);
94+
// GB3: CR × LF
95+
if (catBefore === 1) {
96+
boundary = catAfter !== 6;
97+
// GB4 is implicit: CR breaks unless followed by LF
98+
}
99+
// GB4: (Control | LF) ÷ — CR is already handled by the GB3 branch above
100+
else if (catBefore === 2 || catBefore === 6) {
101+
boundary = true;
102+
}
103+
// GB5: ÷ (Control | CR | LF)
104+
else if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
105+
boundary = true;
106+
}
107+
// GB9, GB9a: × (Extend | ZWJ | SpacingMark) - most common no-break case
108+
else if (catAfter === 3 || catAfter === 14 || catAfter === 11) {
109+
boundary = false;
110+
// Update emoji state for GB11: track Extend/ExtPic followed by ZWJ
111+
if (catAfter === 14 && (catBefore === 3 || catBefore === 4)) {
112+
emoji = true;
104113
}
105114
}
115+
// GB9b: Prepend ×
116+
else if (catBefore === 9) {
117+
boundary = false;
118+
}
119+
// GB11: ExtPic Extend* ZWJ × ExtPic
120+
else if (catBefore === 14 && catAfter === 4) {
121+
boundary = !emoji;
122+
// emoji state consumed, will be reset on boundary
123+
}
124+
// GB12, GB13: RI × RI (no break while the RI run ending at catBefore has odd length)
125+
else if (catBefore === 10 && catAfter === 10) {
126+
// risCount excludes the RI in catBefore, so an odd value means catBefore is the 2nd, 4th, ... RI: the pair is complete, so break.
127+
boundary = risCount % 2 === 1;
128+
}
129+
// GB6: L × (L | V | LV | LVT)
130+
else if (catBefore === 5) {
131+
boundary = !(catAfter === 5 || catAfter === 13 || catAfter === 7 || catAfter === 8);
132+
}
133+
// GB7: (LV | V) × (V | T)
134+
else if ((catBefore === 7 || catBefore === 13) && (catAfter === 13 || catAfter === 12)) {
135+
boundary = false;
136+
}
137+
// GB8: (LVT | T) × T
138+
else if ((catBefore === 8 || catBefore === 12) && catAfter === 12) {
139+
boundary = false;
140+
}
141+
// GB9c: InCB=Consonant InCB=Extend* InCB=Linker InCB=Extend* × InCB=Consonant
142+
else if (catAfter === 0 && consonant && linker && isIndicConjunctConsonant(cp)) {
143+
boundary = false;
144+
linker = false;
145+
consonant = false;
146+
}
147+
// else GB999: ÷ Any
106148

107-
if (isBoundary(catBefore, catAfter, risCount, emoji, incb)) {
149+
if (boundary) {
108150
yield {
109151
segment: input.slice(index, cursor),
110152
index,
@@ -114,23 +156,29 @@ export function* graphemeSegments(input) {
114156
_catEnd: catBefore,
115157
};
116158

117-
// flush
159+
// Reset segment state
118160
emoji = false;
119-
incb = false;
161+
risCount = 0;
120162
index = cursor;
121163
_catBegin = catAfter;
122164
_hd = cp;
123-
124-
} else if (cp >= 2325) {
125-
// Note: Avoid InCB state checking much as possible
126-
// Update InCB state only when continuing within a segment
127-
if (!consonant && catBefore === 0)
128-
consonant = isIndicConjunctConsonant(_hd);
129-
130-
if (consonant && catAfter === 3)
131-
linker = isIndicConjunctLinker(cp);
132-
else if (catAfter === 0)
133-
linker = false;
165+
} else {
166+
// Update state for continuing segment
167+
168+
// RI counting for GB12/13
169+
if (catBefore === 10) {
170+
risCount++;
171+
}
172+
173+
// InCB state for GB9c (only for Indic scripts, cp >= 2325)
174+
if (cp >= 2325) {
175+
if (!consonant && catBefore === 0) {
176+
consonant = isIndicConjunctConsonant(_hd);
177+
}
178+
if (consonant && catAfter === 3) {
179+
linker = linker || isIndicConjunctLinker(cp);
180+
}
181+
}
134182
}
135183

136184
cursor += cp <= BMP_MAX ? 1 : 2;
@@ -313,79 +361,3 @@ function isIndicConjunctLinker(cp) {
313361
cp === 3405 /* 0x0D4D */
314362
);
315363
}
316-
317-
/**
318-
* @param {GraphemeCategoryNum} catBefore
319-
* @param {GraphemeCategoryNum} catAfter
320-
* @param {number} risCount Regional_Indicator state
321-
* @param {boolean} emoji Extended_Pictographic state
322-
* @param {boolean} incb Indic_Conjunct_Break state
323-
* @return {boolean}
324-
*
325-
* @see https://www.unicode.org/reports/tr29/tr29-43.html#Grapheme_Cluster_Boundary_Rules
326-
*/
327-
function isBoundary(catBefore, catAfter, risCount, emoji, incb) {
328-
// GB3
329-
if (catBefore === 1 && catAfter === 6) {
330-
return false;
331-
}
332-
333-
// GB4
334-
if (catBefore === 1 || catBefore === 2 || catBefore === 6) {
335-
return true;
336-
}
337-
338-
// GB5
339-
if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
340-
return true;
341-
}
342-
343-
// Most common cases - GB9, GB9a extend rules
344-
if (catAfter === 3 || catAfter === 14 || catAfter === 11) {
345-
return false;
346-
}
347-
348-
// GB6 - L x (L | V | LV | LVT)
349-
if (catBefore === 5) {
350-
return !(catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13);
351-
}
352-
353-
// GB7 - (LV | V) x (V | T)
354-
if (
355-
(catBefore === 7 || catBefore === 13) &&
356-
(catAfter === 13 || catAfter === 12)
357-
) {
358-
return false;
359-
}
360-
361-
// GB8 - (LVT | T) x T
362-
if (
363-
(catBefore === 8 || catBefore === 12) &&
364-
catAfter === 12
365-
) {
366-
return false;
367-
}
368-
369-
// GB9b
370-
if (catBefore === 9) {
371-
return false;
372-
}
373-
374-
// GB9c
375-
if (catAfter === 0 && incb) {
376-
return false;
377-
}
378-
379-
// GB11
380-
if (catBefore === 14 && catAfter === 4) {
381-
return !emoji;
382-
}
383-
384-
// GB12, GB13
385-
if (catBefore === 10 && catAfter === 10) {
386-
return risCount % 2 === 0;
387-
}
388-
389-
// GB999
390-
return true;
391-
}

0 commit comments

Comments
 (0)