diff --git a/.changeset/fifty-tips-sit.md b/.changeset/fifty-tips-sit.md new file mode 100644 index 0000000..a568b72 --- /dev/null +++ b/.changeset/fifty-tips-sit.md @@ -0,0 +1,10 @@ +--- +"unicode-segmenter": patch +--- + +Removed an ineffective optimization from the grapheme segmenter. + +The single-range cache is barely hit now that the entire BMP is served by the BMP lookup table, +so it was removed to reduce code size and the number of comparisons. + +Occupying 64KB of linear memory for the BMP table is worth it. It should definitely be acceptable, as it still uses less heap memory than executing graphemer's uncompressed code. diff --git a/README.md b/README.md index a082634..ea7ab5a 100644 --- a/README.md +++ b/README.md @@ -219,7 +219,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:| -| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,730 | 12,199 | 5,113 | 3,787 | 4,807 | +| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,449 | 12,108 | 5,060 | 3,738 | 4,744 | | `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 | | `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,301 | 369,576 | 72,225 | 49,483 | 67,964 | @@ -235,7 +235,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Bytecode size | Bytecode size (gzip)* | |------------------------------|--------------:|----------------------:| -| `unicode-segmenter/grapheme` | 21,542 | 11,392 | +| `unicode-segmenter/grapheme` | 21,372 | 11,328 | | `graphemer` | 134,089 | 31,766 | | `grapheme-splitter` | 63,946 | 19,162 | diff --git a/src/grapheme.js b/src/grapheme.js index 3c6e070..d7f227c 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -56,11 +56,8 
@@ export function* graphemeSegments(input) { /** Total length of the input string. */ let len = input.length; - /** @type {import('./_grapheme_data.js').GraphemeCategoryRange} */ - let cache = [0, 0, 2 /* GC_Control */]; - /** Category of codepoint immediately preceding cursor */ - let catBefore = cat(cp, cache); + let catBefore = cat(cp); /** @type {GraphemeCategoryNum | null} Category of codepoint immediately preceding cursor. */ let catAfter = null; @@ -101,7 +98,7 @@ export function* graphemeSegments(input) { } cp = /** @type {number} */ (input.codePointAt(cursor)); - catAfter = cat(cp, cache); + catAfter = cat(cp); if (catBefore === 10 /* Regional_Indicator */) { risCount++; @@ -222,30 +219,17 @@ let bmpCursor = (() => { * @see https://www.unicode.org/reports/tr29/tr29-43.html#Default_Grapheme_Cluster_Table * * @param {number} cp - * @param {import('./_grapheme_data.js').GraphemeCategoryRange} cache * @return {GraphemeCategoryNum} */ -function cat(cp, cache) { +function cat(cp) { // Fast lookup for BMP (0x0000..0xFFFF) using precomputed table if (cp <= BMP_MAX) { return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]); } - // Use cached result - if (cp >= cache[0] && cp <= cache[1]) { - return cache[2]; - } - // Binary search, starting from bmpCursor let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor); - if (index < 0) { - return 0; - } - - const range = grapheme_ranges[index]; - cache[0] = range[0]; - cache[1] = range[1]; - return (cache[2] = range[2]); + return index < 0 ? 0 : grapheme_ranges[index][2]; }; /**