Skip to content

Commit 65c38ce

Browse files
authored
Move InCB pattern lookup to bottom (#106)
Same logic, but update the InCB state only when it's not the beginning of a segment.
1 parent 298d510 commit 65c38ce

File tree

3 files changed

+24
-19
lines changed

3 files changed

+24
-19
lines changed

.changeset/shiny-buttons-bathe.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
"unicode-segmenter": patch
3+
---
4+
5+
Move GB9c rule checking to be _after_ the main boundary checking.
6+
This avoids unnecessary work as much as possible.
7+
8+
No noticeable changes, but performance seems to be improved by ~2% in most cases.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,704 | 12,554 | 5,308 | 3,958 | 5,010 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,685 | 12,549 | 5,314 | 3,952 | 5,012 |
219219
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 26,309 | 13,811 |
234+
| `unicode-segmenter/grapheme` | 26,278 | 13,797 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -82,21 +82,10 @@ export function* graphemeSegments(input) {
8282
/** Beginning category of a segment */
8383
let _catBegin = catBefore;
8484

85-
/** Memoize the beginnig code point a the segment. */
85+
/** Memoize the beginning code point of the segment. */
8686
let _hd = cp;
8787

8888
while (cursor < len) {
89-
// Note: Lazily update `consonant` and `linker` state
90-
// which is a extra overhead only for Hindi text.
91-
if (cp >= 2325) {
92-
if (!consonant && catBefore === 0) {
93-
consonant = isIndicConjunctConsonant(cp);
94-
} else if (catBefore === 3 /* Extend */) {
95-
// Note: \p{InCB=Linker} is a subset of \p{Extend}
96-
linker = isIndicConjunctLinker(cp);
97-
}
98-
}
99-
10089
cp = /** @type {number} */ (input.codePointAt(cursor));
10190
catAfter = cat(cp);
10291

@@ -110,11 +99,8 @@ export function* graphemeSegments(input) {
11099
) {
111100
emoji = true;
112101

113-
} else if (catAfter === 0 /* Any */ && cp >= 2325) {
114-
// Note: Put GB9c rule checking here to reduce.
115-
incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp));
116-
// It cannot be both a linker and a consonant.
117-
linker = linker && !consonant;
102+
} else if (catAfter === 0) {
103+
incb = consonant && linker && isIndicConjunctConsonant(cp);
118104
}
119105
}
120106

@@ -134,6 +120,17 @@ export function* graphemeSegments(input) {
134120
index = cursor;
135121
_catBegin = catAfter;
136122
_hd = cp;
123+
124+
} else if (cp >= 2325) {
125+
// Note: Avoid InCB state checking as much as possible
126+
// Update InCB state only when continuing within a segment
127+
if (!consonant && catBefore === 0)
128+
consonant = isIndicConjunctConsonant(_hd);
129+
130+
if (consonant && catAfter === 3)
131+
linker = isIndicConjunctLinker(cp);
132+
else if (catAfter === 0)
133+
linker = false;
137134
}
138135

139136
cursor += cp <= BMP_MAX ? 1 : 2;

0 commit comments

Comments
 (0)