Skip to content

Commit 5b3c150

Browse files
committed
reorg internal state
1 parent d737dfe commit 5b3c150

File tree

2 files changed

+23
-24
lines changed

2 files changed

+23
-24
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,708 | 6,659 | 3,363 | 2,739 | 3,490 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,725 | 6,650 | 3,364 | 2,732 | 3,485 |
219219
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 20,259 | 11,417 |
234+
| `unicode-segmenter/grapheme` | 20,229 | 11,393 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -45,47 +45,47 @@ const BMP_MAX = 0xFFFF;
4545
* @return {GraphemeSegmenter} iterator for grapheme cluster segments
4646
*/
4747
export function* graphemeSegments(input) {
48-
let cp = input.codePointAt(0);
49-
50-
// do nothing on empty string
51-
if (cp == null) return;
52-
53-
/** Current cursor position. */
54-
let cursor = cp <= BMP_MAX ? 1 : 2;
55-
5648
/** Total length of the input string. */
5749
let len = input.length;
5850

59-
/** Category of codepoint immediately preceding cursor */
60-
let catBefore = cat(cp);
51+
// do nothing on empty string
52+
if (len === 0) return;
6153

62-
/** @type {GraphemeCategoryNum} Category of codepoint immediately preceding cursor. */
63-
let catAfter = 0;
54+
let cp = /** @type {number}*/ (input.codePointAt(0));
6455

65-
/** The number of RIS codepoints preceding `cursor`. */
66-
let risCount = 0;
56+
/** Memoize the beginning code point of the segment. */
57+
let _hd = cp;
6758

6859
/**
6960
* Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
7061
* Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
7162
*/
7263
let emoji = false;
7364

65+
/** The number of RI codepoints preceding `cursor`. */
66+
let riCount = 0;
67+
7468
/** InCB=Consonant - segment started with Indic consonant */
7569
let consonant = false;
7670

7771
/** InCB=Linker - seen a linker after consonant */
7872
let linker = false;
7973

80-
let index = 0;
74+
/** Category of codepoint immediately preceding cursor */
75+
let catBefore = cat(cp);
8176

82-
/** Beginning category of a segment */
77+
/** Memoize the beginning category of the segment */
8378
let _catBegin = catBefore;
8479

85-
/** Memoize the beginning code point of the segment. */
86-
let _hd = cp;
80+
/** @type {GraphemeCategoryNum} Category of codepoint immediately following cursor. */
81+
let catAfter = 0;
82+
83+
let index = 0;
84+
let cursor = 0;
8785

8886
while (cursor < len) {
87+
cursor += cp <= BMP_MAX ? 1 : 2;
88+
8989
cp = /** @type {number} */ (input.codePointAt(cursor));
9090
catAfter = cat(cp);
9191

@@ -117,8 +117,8 @@ export function* graphemeSegments(input) {
117117
}
118118
// GB12, GB13: RI × RI (odd count means no break)
119119
else if (catBefore === 10 && catAfter === 10) {
120-
// risCount is count BEFORE current RI, so odd means this is 2nd, 4th, etc.
121-
boundary = risCount++ % 2 === 1;
120+
// riCount is count BEFORE current RI, so odd means this is 2nd, 4th, etc.
121+
boundary = riCount++ % 2 === 1;
122122
}
123123
// GB6: L × (L | V | LV | LVT)
124124
else if (catBefore === 5) {
@@ -150,7 +150,7 @@ export function* graphemeSegments(input) {
150150

151151
// Reset segment state
152152
emoji = false;
153-
risCount = 0;
153+
riCount = 0;
154154
index = cursor;
155155
_catBegin = catAfter;
156156
_hd = cp;
@@ -181,7 +181,6 @@ export function* graphemeSegments(input) {
181181
}
182182
}
183183

184-
cursor += cp <= BMP_MAX ? 1 : 2;
185184
catBefore = catAfter;
186185
}
187186

0 commit comments

Comments
 (0)