Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .changeset/fifty-tips-sit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
"unicode-segmenter": patch
---

Removed inefficient optimization code from grapheme segmenter.

The single-range cache is rarely hit now that every BMP code point is resolved by the BMP lookup table first.
It has therefore been removed to reduce code size and the number of comparisons per lookup.

Occupying 64KB of linear memory for the BMP table is a worthwhile trade-off. It should definitely be acceptable, as it still uses less heap memory than executing graphemer's uncompressed code.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,730 | 12,199 | 5,113 | 3,787 | 4,807 |
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,449 | 12,108 | 5,060 | 3,738 | 4,744 |
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,301 | 369,576 | 72,225 | 49,483 | 67,964 |
Expand All @@ -235,7 +235,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 21,542 | 11,392 |
| `unicode-segmenter/grapheme` | 21,372 | 11,328 |
| `graphemer` | 134,089 | 31,766 |
| `grapheme-splitter` | 63,946 | 19,162 |

Expand Down
24 changes: 4 additions & 20 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,8 @@ export function* graphemeSegments(input) {
/** Total length of the input string. */
let len = input.length;

/** @type {import('./_grapheme_data.js').GraphemeCategoryRange} */
let cache = [0, 0, 2 /* GC_Control */];

/** Category of codepoint immediately preceding cursor */
let catBefore = cat(cp, cache);
let catBefore = cat(cp);

/** @type {GraphemeCategoryNum | null} Category of codepoint immediately following cursor. */
let catAfter = null;
Expand Down Expand Up @@ -101,7 +98,7 @@ export function* graphemeSegments(input) {
}

cp = /** @type {number} */ (input.codePointAt(cursor));
catAfter = cat(cp, cache);
catAfter = cat(cp);

if (catBefore === 10 /* Regional_Indicator */) {
risCount++;
Expand Down Expand Up @@ -222,30 +219,17 @@ let bmpCursor = (() => {
* @see https://www.unicode.org/reports/tr29/tr29-43.html#Default_Grapheme_Cluster_Table
*
* @param {number} cp
* @param {import('./_grapheme_data.js').GraphemeCategoryRange} cache
* @return {GraphemeCategoryNum}
*/
function cat(cp, cache) {
function cat(cp) {
// Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
if (cp <= BMP_MAX) {
return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]);
}

// Use cached result
if (cp >= cache[0] && cp <= cache[1]) {
return cache[2];
}

// Binary search, starting from bmpCursor
let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
if (index < 0) {
return 0;
}

const range = grapheme_ranges[index];
cache[0] = range[0];
cache[1] = range[1];
return (cache[2] = range[2]);
return index < 0 ? 0 : grapheme_ranges[index][2];
};

/**
Expand Down