diff --git a/.changeset/bitter-suits-arrive.md b/.changeset/bitter-suits-arrive.md new file mode 100644 index 0000000..f61049f --- /dev/null +++ b/.changeset/bitter-suits-arrive.md @@ -0,0 +1,12 @@ +--- +"unicode-segmenter": patch +--- + +Improve runtime perf on the Unicode text processing. + +By using a precomputed lookup table for the grapheme categories of BMP characters, it improves perf by more than 10% for common cases, even ~30% for some extreme cases. + +The lookup table consumes an additional 64 KB of memory, which is acceptable for most JavaScript runtime environments. + +This optimization is introduced by OpenCode w/ OpenAI's GPT-OSS-120B. It is the second successful attempt at meaningful optimization in this library. +(The first one was Claude Code w/ Claude Opus 4.0) diff --git a/README.md b/README.md index 3b49ce3..2444b0f 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Unicode® | ESM? 
| Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:| -| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,588 | 12,168 | 5,038 | 3,715 | 4,727 | +| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,730 | 12,199 | 5,113 | 3,787 | 4,807 | | `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 | 6,750 | | `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,285 | 369,560 | 72,218 | 49,416 | 67,975 | @@ -236,7 +236,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Bytecode size | Bytecode size (gzip)* | |------------------------------|--------------:|----------------------:| -| `unicode-segmenter/grapheme` | 21,001 | 11,065 | +| `unicode-segmenter/grapheme` | 21,435 | 11,351 | | `graphemer` | 133,978 | 31,713 | | `grapheme-splitter` | 63,835 | 19,137 | @@ -246,16 +246,16 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb Here is a brief explanation, and you can see [archived benchmark results](benchmark/grapheme/_records). -**Performance in Node.js**: `unicode-segmenter/grapheme` is significantly faster than alternatives. -- 6\~15x faster than other JavaScript libraries -- 1.5\~3x faster than WASM binding of the Rust's [unicode-segmentation] -- 1.5\~3x faster than built-in [`Intl.Segmenter`] +**Performance in Node.js/Bun/Deno**: `unicode-segmenter/grapheme` has best-in-class performance. +- 8\~35x faster than other JavaScript libraries. +- 3\~5x faster than WASM binding of the Rust's [unicode-segmentation]. +- 2\~3x faster than built-in [`Intl.Segmenter`]. -**Performance in Bun**: `unicode-segmenter/grapheme` has almost the same performance as the built-in [`Intl.Segmenter`], with no performance degradation compared to other JavaScript libraries. 
+**Performance in Browsers**: The performance in browser environments varies greatly due to differences in browser engines, which makes benchmarking inconsistent, but: +- Still significantly faster than other JavaScript libraries. +- Generally outperforms the built-in [`Intl.Segmenter`] in most browser environments, except Firefox. -**Performance in Browsers**: The performance in browser environments varies greatly due to differences in browser engines and versions, which makes benchmarking less consistent. Despite these variations, `unicode-segmenter/grapheme` generally outperforms other JavaScript libraries in most environments. - -**Performance in React Native**: `unicode-segmenter/grapheme` is significantly faster than alternatives when compiled to Hermes bytecode. It's 3\~8x faster than `graphemer` and 20\~26x faster than `grapheme-splitter`, with the performance gap increasing with input size. +**Performance in React Native**: `unicode-segmenter/grapheme` is still faster than alternatives when compiled to Hermes bytecode. It's 3\~8x faster than `graphemer` and 20\~26x faster than `grapheme-splitter`, with the performance gap increasing with input size. **Performance in QuickJS**: `unicode-segmenter/grapheme` is the only usable library in terms of performance. 
diff --git a/src/core.js b/src/core.js index 1c4f9cb..8ac4448 100644 --- a/src/core.js +++ b/src/core.js @@ -63,9 +63,7 @@ export function decodeUnicodeData(data, cats = '') { * @param {CategorizedUnicodeRange[]} ranges * @return {number} index of matched unicode range, or -1 if no match */ -export function findUnicodeRangeIndex(cp, ranges) { - let lo = 0 - , hi = ranges.length - 1; +export function findUnicodeRangeIndex(cp, ranges, lo = 0, hi = ranges.length - 1) { while (lo <= hi) { let mid = lo + hi >>> 1 , range = ranges[mid]; diff --git a/src/grapheme.js b/src/grapheme.js index bdbda24..3c6e070 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -33,6 +33,8 @@ import { consonant_ranges } from './_incb_data.js'; export { GraphemeCategory }; +const BMP_MAX = 0xFFFF; + /** * Unicode segmentation by extended grapheme rules. * @@ -49,7 +51,7 @@ export function* graphemeSegments(input) { if (cp == null) return; /** Current cursor position. */ - let cursor = cp <= 0xFFFF ? 1 : 2; + let cursor = cp <= BMP_MAX ? 1 : 2; /** Total length of the input string. */ let len = input.length; @@ -137,7 +139,7 @@ export function* graphemeSegments(input) { _hd = cp; } - cursor += cp <= 0xFFFF ? 1 : 2; + cursor += cp <= BMP_MAX ? 1 : 2; catBefore = catAfter; } @@ -194,6 +196,26 @@ export function* splitGraphemes(text) { for (let s of graphemeSegments(text)) yield s.segment; } +/** + * Precompute a fast lookup table for BMP code points (0..0xFFFF) + * This table maps each code point to its Grapheme_Cluster_Break category. + * It is generated once at module load time using the grapheme_ranges data. + * The table is a Uint8Array of length 0x10000 (64KB), which is acceptable in memory. + * For code points >= 0x10000 we fall back to binary search. 
/**
 * Precomputed `Grapheme_Cluster_Break` lookup table for BMP code points
 * (0..0xFFFF), indexed by code point.
 *
 * It is filled once at module load time from `grapheme_ranges`. Code points
 * that no range covers keep the typed array's initial value 0, which is the
 * same value `cat` returns when the range search finds no match.
 */
let bmpLookup = new Uint8Array(BMP_MAX + 1);

/**
 * Index of the first entry of `grapheme_ranges` that `cat` still has to
 * binary-search for code points above the BMP.
 *
 * Every entry before this index lies entirely inside the BMP and is fully
 * represented in `bmpLookup`.
 *
 * NOTE: relies on `grapheme_ranges` being sorted by code point, which the
 * binary search in `findUnicodeRangeIndex` requires as well.
 */
let bmpCursor = (() => {
  let cursor = 0;
  while (cursor < grapheme_ranges.length) {
    let range = grapheme_ranges[cursor];

    // Stop *before* consuming a range that starts above the BMP.
    // Advancing past it would hide it from the binary search in `cat`,
    // and its code points would silently get category 0.
    if (range[0] > BMP_MAX) break;

    // Clamp to the table size so a range crossing U+FFFF neither iterates
    // over supplementary code points nor relies on ignored out-of-bounds
    // typed-array writes.
    let end = range[1] < BMP_MAX ? range[1] : BMP_MAX;
    for (let cp = range[0]; cp <= end; cp++) {
      bmpLookup[cp] = range[2];
    }

    // A range that crosses the BMP boundary must stay searchable for its
    // supplementary part, so the cursor stays on it.
    if (range[1] > BMP_MAX) break;

    cursor++;
  }
  return cursor;
})();

/**
 * `Grapheme_Cluster_Break` property value of a given code point.
 *
 * BMP code points are answered directly from `bmpLookup`. Code points above
 * the BMP are answered from `cache` when they fall inside the cached range,
 * otherwise by a binary search over `grapheme_ranges` starting at `bmpCursor`.
 *
 * @param {number} cp
 *   Code point to classify.
 * @param {[number, number, GraphemeCategoryNum]} cache
 *   Most recently matched range as `[start, end, category]`. It is updated
 *   in place whenever the binary search finds a new matching range.
 * @return {GraphemeCategoryNum}
 *   The category number, or 0 when no range contains `cp`.
 */
function cat(cp, cache) {
  // Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
  if (cp <= BMP_MAX) {
    return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]);
  }

  // Use cached result
  if (cp >= cache[0] && cp <= cache[1]) {
    return cache[2];
  }

  // Binary search, skipping the ranges already covered by the BMP table
  let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
  if (index < 0) {
    return 0;
  }

  // Remember the matched range so that runs of neighbouring code points
  // skip the search.
  const range = grapheme_ranges[index];
  cache[0] = range[0];
  cache[1] = range[1];
  return (cache[2] = range[2]);
}
return !emoji; + } - // GB12, GB13 - if (catBefore === 10 && catAfter === 10) { - return risCount % 2 === 0; - } + // GB12, GB13 + if (catBefore === 10 && catAfter === 10) { + return risCount % 2 === 0; } // GB999