Skip to content

Commit b7a6e12

Browse files
authored
Optimizing category lookup + bit-packing for cache (#101)
Optimizing grapheme break category lookup for better runtime trade-offs. See [issue](#104) for the explanation. Closes #102 and #103 as duplicates. Resolves #104.
1 parent 5cb8e83 commit b7a6e12

File tree

3 files changed

+91
-22
lines changed

3 files changed

+91
-22
lines changed

.changeset/five-seas-enjoy.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
"unicode-segmenter": patch
3+
---
4+
5+
Optimizing grapheme break category lookup for better runtime trade-offs.
6+
7+
See [issue](https://github.com/cometkim/unicode-segmenter/issues/104) for the explanation.
8+
9+
With this change, the library's constant memory footprint is reduced from 64 KB to 14 KB without performance regressions.
10+
However, the code size increases slightly due to inlining. It's still relatively small.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,449 | 12,108 | 5,060 | 3,738 | 4,744 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,752 | 12,562 | 5,308 | 3,968 | 5,013 |
219219
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 21,372 | 11,328 |
234+
| `unicode-segmenter/grapheme` | 26,309 | 13,811 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -193,21 +193,54 @@ export function* splitGraphemes(text) {
193193
for (let s of graphemeSegments(text)) yield s.segment;
194194
}
195195

196+
const
197+
/** 0x80 */
198+
SEG0_MIN = 128,
199+
/** 0x2FFF */
200+
SEG0_MAX = 12287,
201+
/** 0xA000 */
202+
SEG1_MIN = 40960,
203+
/** 0xDFFF */
204+
SEG1_MAX = 57343;
205+
196206
/**
197-
* Precompute a fast lookup table for BMP code points (0..0xFFFF)
198-
* This table maps each code point to its Grapheme_Cluster_Break category.
199-
* It is generated once at module load time using the grapheme_ranges data.
200-
* The table is a Uint8Array of length 0x10000 (64KB), which is acceptable in memory.
201-
* For code points >= 0x10000 we fall back to binary search.
207+
* Segmented 4-bit packed lookup tables for BMP code points.
208+
*
209+
* Memory optimization: Skip regions that are almost 100% category {@link GC_Any}:
210+
* - 0x3000-0x9FFF (CJK): 28,672 codepoints, only 12 non-Any ranges -> need to be inlined
211+
* - 0xE000-0xFDFF (Private Use and compatibility/presentation forms): 7,680 codepoints, only 1 non-Any range (0xFB1E) -> very rare, and simple to inline
212+
* - 0xFE00-0xFFFF (Specials): 512 codepoints, only 5 ranges -> very rare, fall back to binary search
213+
*
214+
* Cache segments (4-bit packed, 2 categories per byte):
215+
* - SEG0: 0x0080-0x2FFF (12,160 codepoints -> 6,080 bytes)
216+
* - SEG1: 0xA000-0xDFFF (16,384 codepoints -> 8,192 bytes)
217+
*
218+
* Total: 14,272 bytes (~14KB)
202219
*/
203-
let bmpLookup = new Uint8Array(BMP_MAX + 1);
204-
let bmpCursor = (() => {
220+
const SEG0 = new Uint8Array((SEG0_MAX - SEG0_MIN + 1) >> 1);
221+
const SEG1 = new Uint8Array((SEG1_MAX - SEG1_MIN + 1) >> 1);
222+
const SEG_CURSOR = (() => {
205223
let cursor = 0;
206-
let cp = 0;
207-
while (cp <= BMP_MAX) {
208-
let range = grapheme_ranges[cursor++];
209-
for (cp = range[0]; cp <= range[1]; cp++) {
210-
bmpLookup[cp] = range[2];
224+
while (cursor < grapheme_ranges.length) {
225+
let [start, end, cat] = grapheme_ranges[cursor];
226+
if (start > SEG1_MAX) break;
227+
cursor++;
228+
229+
// Skip ranges outside segments (ASCII/CJK/PrivateUse fast paths)
230+
if (end < SEG0_MIN || (start > SEG0_MAX && end < SEG1_MIN)) continue;
231+
232+
for (let cp = start; cp <= end; cp++) {
233+
let /** @type {Uint8Array} */ seg, idx = 0;
234+
235+
if (cp >= SEG0_MIN && cp <= SEG0_MAX) {
236+
seg = SEG0; idx = (cp - SEG0_MIN) >> 1;
237+
} else if (cp >= SEG1_MIN) {
238+
seg = SEG1; idx = (cp - SEG1_MIN) >> 1;
239+
} else continue;
240+
241+
seg[idx] = cp & 1
242+
? (seg[idx] & 0x0F) | (cat << 4)
243+
: (seg[idx] & 0xF0) | cat;
211244
}
212245
}
213246
return cursor;
@@ -222,15 +255,41 @@ let bmpCursor = (() => {
222255
* @return {GraphemeCategoryNum}
223256
*/
224257
function cat(cp) {
225-
// Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
226-
if (cp <= BMP_MAX) {
227-
return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]);
258+
// ASCII fast path
259+
if (cp < SEG0_MIN) {
260+
if (cp >= 32) return 0;
261+
if (cp === 10) return 6;
262+
if (cp === 13) return 1;
263+
return 2;
228264
}
229-
230-
// Binary search, starting from bmpCursor
231-
let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
232-
return index < 0 ? 0 : grapheme_ranges[index][2];
233-
};
265+
// Segment 0
266+
if (cp <= SEG0_MAX) {
267+
let byte = SEG0[(cp - SEG0_MIN) >> 1];
268+
return /** @type {GraphemeCategoryNum} */ (cp & 1 ? byte >> 4 : byte & 0x0F);
269+
}
270+
// CJK fast path
271+
if (cp < SEG1_MIN) {
272+
if (cp < 0x3030) return cp >= 0x302A ? 3 : 0;
273+
if (cp < 0x309B) {
274+
if (cp === 0x3030 || cp === 0x303D) return 4;
275+
return cp >= 0x3099 ? 3 : 0;
276+
}
277+
if (cp === 0x3297 || cp === 0x3299) return 4;
278+
return 0;
279+
}
280+
// Segment 1
281+
if (cp <= SEG1_MAX) {
282+
let byte = SEG1[(cp - SEG1_MIN) >> 1];
283+
return /** @type {GraphemeCategoryNum} */ (cp & 1 ? byte >> 4 : byte & 0x0F);
284+
}
285+
// Private Use fast path
286+
if (cp < 0xFE00) {
287+
return cp === 0xFB1E ? 3 : 0;
288+
}
289+
// Specials (0xFE00-0xFFFF) and Non-BMP: binary search
290+
let idx = findUnicodeRangeIndex(cp, grapheme_ranges, SEG_CURSOR);
291+
return idx < 0 ? 0 : grapheme_ranges[idx][2];
292+
}
234293

235294
/**
236295
* @param {number} cp

0 commit comments

Comments
 (0)