Skip to content

Commit 9d482aa

Browse files
authored
Inline grapheme boundary checking (#113)
1 parent e50d821 commit 9d482aa

File tree

3 files changed

+86
-113
lines changed

3 files changed

+86
-113
lines changed

.changeset/four-apples-show.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
"unicode-segmenter": patch
3+
---
4+
5+
Inlined the grapheme boundary checking
6+
to avoid unnecessary function calls in the hot path and to consolidate internal state.
7+
8+
This improved runtime performance by 2% and slightly reduced the bundle size.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,937 | 6,743 | 3,401 | 2,770 | 3,520 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,774 | 6,675 | 3,368 | 2,755 | 3,497 |
219219
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 20,446 | 11,561 |
234+
| `unicode-segmenter/grapheme` | 20,295 | 11,420 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 76 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -59,52 +59,86 @@ export function* graphemeSegments(input) {
5959
/** Category of codepoint immediately preceding cursor */
6060
let catBefore = cat(cp);
6161

62-
/** @type {GraphemeCategoryNum | null} Category of codepoint immediately preceding cursor. */
63-
let catAfter = null;
62+
/** @type {GraphemeCategoryNum} Category of codepoint immediately following cursor. */
63+
let catAfter = 0;
6464

6565
/** The number of RIS codepoints preceding `cursor`. */
6666
let risCount = 0;
6767

68-
/** Emoji state */
68+
/**
69+
* Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
70+
* Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
71+
*/
6972
let emoji = false;
7073

71-
/** InCB=Consonant */
74+
/** InCB=Consonant - segment started with Indic consonant */
7275
let consonant = false;
7376

74-
/** InCB=Linker */
77+
/** InCB=Linker - seen a linker after consonant */
7578
let linker = false;
7679

77-
/** InCB=Consonant InCB=Linker x InCB=Consonant */
78-
let incb = false;
79-
8080
let index = 0;
8181

8282
/** Beginning category of a segment */
8383
let _catBegin = catBefore;
8484

85-
/** Memoize the beginnig code point of the segment. */
85+
/** Memoize the beginning code point of the segment. */
8686
let _hd = cp;
8787

8888
while (cursor < len) {
8989
cp = /** @type {number} */ (input.codePointAt(cursor));
9090
catAfter = cat(cp);
9191

92-
if (catBefore === 10 /* Regional_Indicator */) {
93-
risCount++;
94-
} else {
95-
risCount = 0;
96-
if (
97-
catAfter === 14 /* ZWJ */
98-
&& (catBefore === 3 /* Extend */ || catBefore === 4 /* Extended_Pictographic */)
99-
) {
100-
emoji = true;
92+
let boundary = true;
10193

102-
} else if (catAfter === 0) {
103-
incb = consonant && linker && isIndicConjunctConsonant(cp);
104-
}
94+
// GB3: CR × LF
95+
if (catBefore === 1) {
96+
boundary = catAfter !== 6;
97+
}
98+
// GB4: (Control | CR | LF) ÷
99+
else if (catBefore === 2 || catBefore === 6) {
100+
boundary = true;
101+
}
102+
// GB5: ÷ (Control | CR | LF)
103+
else if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
104+
boundary = true;
105+
}
106+
// GB9, GB9a: × (Extend | ZWJ | SpacingMark) - most common no-break case
107+
else if (catAfter === 3 || catAfter === 14 || catAfter === 11) {
108+
boundary = false;
109+
}
110+
// GB9b: Prepend ×
111+
else if (catBefore === 9) {
112+
boundary = false;
113+
}
114+
// GB11: ExtPic Extend* ZWJ × ExtPic
115+
else if (catBefore === 14 && catAfter === 4) {
116+
boundary = !emoji;
117+
}
118+
// GB12, GB13: RI × RI (odd count means no break)
119+
else if (catBefore === 10 && catAfter === 10) {
120+
// risCount counts the RI × RI joins already made in this segment, so an odd value means the preceding RIs are fully paired and a break is required.
121+
boundary = risCount++ % 2 === 1;
122+
}
123+
// GB6: L × (L | V | LV | LVT)
124+
else if (catBefore === 5) {
125+
boundary = !(catAfter === 5 || catAfter === 13 || catAfter === 7 || catAfter === 8);
105126
}
127+
// GB7: (LV | V) × (V | T)
128+
else if ((catBefore === 7 || catBefore === 13) && (catAfter === 13 || catAfter === 12)) {
129+
boundary = false;
130+
}
131+
// GB8: (LVT | T) × T
132+
else if ((catBefore === 8 || catBefore === 12) && catAfter === 12) {
133+
boundary = false;
134+
}
135+
// GB9c: InCB=Consonant InCB=Extend* InCB=Linker InCB=Extend* × InCB=Consonant
136+
else if (catAfter === 0 && consonant && linker && isIndicConjunctConsonant(cp)) {
137+
boundary = false;
138+
}
139+
// else GB999: ÷ Any
106140

107-
if (isBoundary(catBefore, catAfter, risCount, emoji, incb)) {
141+
if (boundary) {
108142
yield {
109143
segment: input.slice(index, cursor),
110144
index,
@@ -114,23 +148,30 @@ export function* graphemeSegments(input) {
114148
_catEnd: catBefore,
115149
};
116150

117-
// flush
151+
// Reset segment state
118152
emoji = false;
119-
incb = false;
153+
risCount = 0;
120154
index = cursor;
121155
_catBegin = catAfter;
122156
_hd = cp;
123-
124-
} else if (cp >= 2325) {
125-
// Note: Avoid InCB state checking much as possible
126-
// Update InCB state only when continuing within a segment
127-
if (!consonant && catBefore === 0)
128-
consonant = isIndicConjunctConsonant(_hd);
129-
130-
if (consonant && catAfter === 3)
131-
linker = isIndicConjunctLinker(cp);
132-
else if (catAfter === 0)
133-
linker = false;
157+
}
158+
// Update state for continuing segment
159+
else {
160+
// emoji state for GB11
161+
if (catAfter === 14 && (catBefore === 3 || catBefore === 4)) {
162+
emoji = true;
163+
}
164+
// InCB state for GB9c
165+
else if (cp >= 2325) {
166+
if (!consonant && catBefore === 0) {
167+
consonant = isIndicConjunctConsonant(_hd);
168+
}
169+
if (consonant && catAfter === 3) {
170+
linker = linker || isIndicConjunctLinker(cp);
171+
} else {
172+
linker = false;
173+
}
174+
}
134175
}
135176

136177
cursor += cp <= BMP_MAX ? 1 : 2;
@@ -313,79 +354,3 @@ function isIndicConjunctLinker(cp) {
313354
cp === 3405 /* 0x0D4D */
314355
);
315356
}
316-
317-
/**
318-
* @param {GraphemeCategoryNum} catBefore
319-
* @param {GraphemeCategoryNum} catAfter
320-
* @param {number} risCount Regional_Indicator state
321-
* @param {boolean} emoji Extended_Pictographic state
322-
* @param {boolean} incb Indic_Conjunct_Break state
323-
* @return {boolean}
324-
*
325-
* @see https://www.unicode.org/reports/tr29/tr29-43.html#Grapheme_Cluster_Boundary_Rules
326-
*/
327-
function isBoundary(catBefore, catAfter, risCount, emoji, incb) {
328-
// GB3
329-
if (catBefore === 1 && catAfter === 6) {
330-
return false;
331-
}
332-
333-
// GB4
334-
if (catBefore === 1 || catBefore === 2 || catBefore === 6) {
335-
return true;
336-
}
337-
338-
// GB5
339-
if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
340-
return true;
341-
}
342-
343-
// Most common cases - GB9, GB9a extend rules
344-
if (catAfter === 3 || catAfter === 14 || catAfter === 11) {
345-
return false;
346-
}
347-
348-
// GB6 - L x (L | V | LV | LVT)
349-
if (catBefore === 5) {
350-
return !(catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13);
351-
}
352-
353-
// GB7 - (LV | V) x (V | T)
354-
if (
355-
(catBefore === 7 || catBefore === 13) &&
356-
(catAfter === 13 || catAfter === 12)
357-
) {
358-
return false;
359-
}
360-
361-
// GB8 - (LVT | T) x T
362-
if (
363-
(catBefore === 8 || catBefore === 12) &&
364-
catAfter === 12
365-
) {
366-
return false;
367-
}
368-
369-
// GB9b
370-
if (catBefore === 9) {
371-
return false;
372-
}
373-
374-
// GB9c
375-
if (catAfter === 0 && incb) {
376-
return false;
377-
}
378-
379-
// GB11
380-
if (catBefore === 14 && catAfter === 4) {
381-
return !emoji;
382-
}
383-
384-
// GB12, GB13
385-
if (catBefore === 10 && catAfter === 10) {
386-
return risCount % 2 === 0;
387-
}
388-
389-
// GB999
390-
return true;
391-
}

0 commit comments

Comments
 (0)