Skip to content

Commit b147c94

Browse files
committed
remove seg2 cache, fallback to binary search
1 parent 882db6b commit b147c94

File tree

2 files changed

+22
-28
lines changed

2 files changed

+22
-28
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,775 | 12,519 | 5,297 | 3,933 | 4,985 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,506 | 12,465 | 5,268 | 3,956 | 4,955 |
219219
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 26,260 | 13,720 |
234+
| `unicode-segmenter/grapheme` | 26,075 | 13,630 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -208,34 +208,33 @@ const
208208
/**
209209
* Segmented lookup tables for BMP code points.
210210
*
211-
* Memory optimization: Skip regions that are almost 100% category 0 (Any):
212-
* - 0x3000-0x9FFF (CJK): 28,672 codepoints, only 12 non-Any -> inlined fast path
213-
* - 0xE000-0xFDFF (Private Use): 7,680 codepoints, only 1 non-Any -> inlined fast path
211+
* Memory optimization: Skip regions that are almost 100% category {@link GC_Any}:
212+
* - 0x3000-0x9FFF (CJK): 28,672 codepoints, only 12 non-Any ranges -> inlined as a fast path
213+
* - 0xE000-0xFDFF (Private Use): 7,680 codepoints, only 1 non-Any range -> very rare, but simple enough to inline
214+
* - 0xFE00-0xFFFF (Specials): 512 codepoints, only 5 ranges -> very rare, fall back to binary search
214215
*
215216
* Cache segments:
216-
* - seg0: 0x0080-0x2FFF (12,160 bytes)
217-
* - seg1: 0xA000-0xDFFF (16,384 bytes)
218-
* - seg2: 0xFE00-0xFFFF (512 bytes)
217+
* - SEG0: 0x0080-0x2FFF (12,160 bytes)
218+
* - SEG1: 0xA000-0xDFFF (16,384 bytes)
219219
*
220-
* Total: 29,056 bytes (~28KB)
220+
* Total: 28,544 bytes (~28KB)
221221
*/
222-
let seg0 = new Uint8Array(SEG0_MAX - SEG0_MIN + 1);
223-
let seg1 = new Uint8Array(SEG1_MAX - SEG1_MIN + 1);
224-
let seg2 = new Uint8Array(BMP_MAX - SEG2_MIN + 1);
225-
let bmpCursor = (() => {
222+
let SEG0 = new Uint8Array(SEG0_MAX - SEG0_MIN + 1);
223+
let SEG1 = new Uint8Array(SEG1_MAX - SEG1_MIN + 1);
224+
let SEG_CURSOR = (() => {
226225
let cursor = 0;
227226
while (cursor < grapheme_ranges.length) {
228227
let [start, end, cat] = grapheme_ranges[cursor];
229-
if (start > BMP_MAX) break;
228+
if (start > SEG1_MAX) break;
230229
cursor++;
231230

232231
// Skip ranges outside segments (ASCII/CJK/PrivateUse fast paths)
233-
if (end < SEG0_MIN || (start > SEG0_MAX && end < SEG1_MIN) || (start > SEG1_MAX && end < SEG2_MIN)) continue;
232+
if (end < SEG0_MIN || (start > SEG0_MAX && end < SEG1_MIN)) continue;
234233

235-
for (let cp = start; cp <= end && cp <= BMP_MAX; cp++) {
236-
if (cp >= SEG0_MIN && cp <= SEG0_MAX) seg0[cp - SEG0_MIN] = cat;
237-
else if (cp >= SEG1_MIN && cp <= SEG1_MAX) seg1[cp - SEG1_MIN] = cat;
238-
else if (cp >= SEG2_MIN) seg2[cp - SEG2_MIN] = cat;
234+
for (let cp = start; cp <= end; cp++) {
235+
if (cp >= SEG0_MIN && cp <= SEG0_MAX) SEG0[cp - SEG0_MIN] = cat;
236+
else if (cp >= SEG1_MIN && cp <= SEG1_MAX) SEG1[cp - SEG1_MIN] = cat;
237+
else continue;
239238
}
240239
}
241240
return cursor;
@@ -256,8 +255,7 @@ function cat(cp) {
256255
// 3. CJK fast path
257256
// 4. Segment 1 cache
258257
// 5. PrivateUse fast path
259-
// 6. Segment 2 cache
260-
// 7. Non-BMP binary search
258+
// 6. Binary search (Specials 0xFE00-0xFFFF and non-BMP)
261259

262260
// ASCII fast path
263261
if (cp < SEG0_MIN) {
@@ -268,7 +266,7 @@ function cat(cp) {
268266
}
269267
// Segment 0
270268
if (cp <= SEG0_MAX) {
271-
return /** @type {GraphemeCategoryNum} */ (seg0[cp - SEG0_MIN]);
269+
return /** @type {GraphemeCategoryNum} */ (SEG0[cp - SEG0_MIN]);
272270
}
273271
// CJK fast path
274272
if (cp < SEG1_MIN) {
@@ -282,18 +280,14 @@ function cat(cp) {
282280
}
283281
// Segment 1
284282
if (cp <= SEG1_MAX) {
285-
return /** @type {GraphemeCategoryNum} */ (seg1[cp - SEG1_MIN]);
283+
return /** @type {GraphemeCategoryNum} */ (SEG1[cp - SEG1_MIN]);
286284
}
287285
// Private Use fast path
288286
if (cp < SEG2_MIN) {
289287
return cp === 0xFB1E ? 3 : 0;
290288
}
291-
// Segment 2
292-
if (cp <= BMP_MAX) {
293-
return /** @type {GraphemeCategoryNum} */ (seg2[cp - SEG2_MIN]);
294-
}
295289
// Specials (0xFE00-0xFFFF) and non-BMP: binary search
296-
let idx = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
290+
let idx = findUnicodeRangeIndex(cp, grapheme_ranges, SEG_CURSOR);
297291
return idx < 0 ? 0 : grapheme_ranges[idx][2];
298292
}
299293

0 commit comments

Comments
 (0)