Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changeset/bitter-suits-arrive.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
"unicode-segmenter": patch
---

Improve runtime perf on the Unicode text processing.

By using a precomputed lookup table for the grapheme categories of BMP characters, it improves perf by more than 10% in common cases, and by up to ~30% in some extreme cases.

The lookup table consumes an additional 64 KB of memory, which is acceptable for most JavaScript runtime environments.

This optimization is introduced by OpenCode w/ OpenAI's GPT-OSS-120B. It is the second successful attempt at meaningful optimization in this library.
(The first one was made by Claude Code w/ Claude Opus 4.0.)
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,588 | 12,168 | 5,038 | 3,715 | 4,727 |
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,730 | 12,199 | 5,113 | 3,787 | 4,807 |
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 | 6,750 |
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,285 | 369,560 | 72,218 | 49,416 | 67,975 |
Expand All @@ -236,7 +236,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 21,001 | 11,065 |
| `unicode-segmenter/grapheme` | 21,435 | 11,351 |
| `graphemer` | 133,978 | 31,713 |
| `grapheme-splitter` | 63,835 | 19,137 |

Expand All @@ -246,16 +246,16 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

Here is a brief explanation, and you can see [archived benchmark results](benchmark/grapheme/_records).

**Performance in Node.js**: `unicode-segmenter/grapheme` is significantly faster than alternatives.
- 6\~15x faster than other JavaScript libraries
- 1.5\~3x faster than WASM binding of the Rust's [unicode-segmentation]
- 1.5\~3x faster than built-in [`Intl.Segmenter`]
**Performance in Node.js/Bun/Deno**: `unicode-segmenter/grapheme` has best-in-class performance.
- 8\~35x faster than other JavaScript libraries.
- 3\~5x faster than the WASM binding of Rust's [unicode-segmentation].
- 2\~3x faster than built-in [`Intl.Segmenter`].

**Performance in Bun**: `unicode-segmenter/grapheme` has almost the same performance as the built-in [`Intl.Segmenter`], with no performance degradation compared to other JavaScript libraries.
**Performance in Browsers**: The performance in browser environments varies greatly due to differences in browser engines, which makes benchmarking inconsistent, but:
- Still significantly faster than other JavaScript libraries.
- Generally outperforms the built-in [`Intl.Segmenter`] in most browser environments, except Firefox.

**Performance in Browsers**: The performance in browser environments varies greatly due to differences in browser engines and versions, which makes benchmarking less consistent. Despite these variations, `unicode-segmenter/grapheme` generally outperforms other JavaScript libraries in most environments.

**Performance in React Native**: `unicode-segmenter/grapheme` is significantly faster than alternatives when compiled to Hermes bytecode. It's 3\~8x faster than `graphemer` and 20\~26x faster than `grapheme-splitter`, with the performance gap increasing with input size.
**Performance in React Native**: `unicode-segmenter/grapheme` is still faster than alternatives when compiled to Hermes bytecode. It's 3\~8x faster than `graphemer` and 20\~26x faster than `grapheme-splitter`, with the performance gap increasing with input size.

**Performance in QuickJS**: `unicode-segmenter/grapheme` is the only usable library in terms of performance.

Expand Down
4 changes: 1 addition & 3 deletions src/core.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,7 @@ export function decodeUnicodeData(data, cats = '') {
* @param {CategorizedUnicodeRange<T>[]} ranges
* @return {number} index of matched unicode range, or -1 if no match
*/
export function findUnicodeRangeIndex(cp, ranges) {
let lo = 0
, hi = ranges.length - 1;
export function findUnicodeRangeIndex(cp, ranges, lo = 0, hi = ranges.length - 1) {
while (lo <= hi) {
let mid = lo + hi >>> 1
, range = ranges[mid];
Expand Down
136 changes: 73 additions & 63 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ import { consonant_ranges } from './_incb_data.js';

export { GraphemeCategory };

const BMP_MAX = 0xFFFF;

/**
* Unicode segmentation by extended grapheme rules.
*
Expand All @@ -49,7 +51,7 @@ export function* graphemeSegments(input) {
if (cp == null) return;

/** Current cursor position. */
let cursor = cp <= 0xFFFF ? 1 : 2;
let cursor = cp <= BMP_MAX ? 1 : 2;

/** Total length of the input string. */
let len = input.length;
Expand Down Expand Up @@ -137,7 +139,7 @@ export function* graphemeSegments(input) {
_hd = cp;
}

cursor += cp <= 0xFFFF ? 1 : 2;
cursor += cp <= BMP_MAX ? 1 : 2;
catBefore = catAfter;
}

Expand Down Expand Up @@ -194,6 +196,26 @@ export function* splitGraphemes(text) {
for (let s of graphemeSegments(text)) yield s.segment;
}

/**
 * Precomputed lookup table for BMP code points (0..0xFFFF).
 *
 * Maps each code point to its Grapheme_Cluster_Break category number, so the
 * hot path for BMP characters is a single typed-array read.
 * It is generated once at module load time from the `grapheme_ranges` data.
 * Code points not covered by any range keep the Uint8Array default of 0,
 * which is the same value `cat()` returns when no range matches.
 *
 * The table is a Uint8Array of length 0x10000 (64KB), which is acceptable in memory.
 * For code points >= 0x10000 we fall back to binary search.
 */
let bmpLookup = new Uint8Array(BMP_MAX + 1);

/**
 * Index of the first entry in `grapheme_ranges` that is NOT entirely inside
 * the BMP, used as the lower bound of the binary search for code points
 * above 0xFFFF.
 *
 * The cursor only advances past ranges that end at or before BMP_MAX. A range
 * that starts above BMP_MAX, or one that straddles it, must stay reachable by
 * the binary search; skipping it would make its code points resolve to 0.
 *
 * NOTE(review): assumes `grapheme_ranges` is sorted ascending by start, which
 * the binary search over the same data already relies on.
 */
let bmpCursor = (() => {
  let cursor = 0;
  // Bounds check guards against data that ends before reaching BMP_MAX.
  while (cursor < grapheme_ranges.length) {
    let range = grapheme_ranges[cursor];
    let start = range[0];
    let end = range[1];

    // First range fully above the BMP: nothing left to fill.
    if (start > BMP_MAX) break;

    // `fill` takes an exclusive end index; clamp so we never go past the table.
    bmpLookup.fill(range[2], start, Math.min(end, BMP_MAX) + 1);

    // A straddling range also covers supplementary code points, so keep it
    // inside the binary-search window.
    if (end > BMP_MAX) break;

    cursor++;
  }
  return cursor;
})();

/**
* `Grapheme_Cluster_Break` property value of a given codepoint
*
Expand All @@ -204,35 +226,26 @@ export function* splitGraphemes(text) {
* @return {GraphemeCategoryNum}
*/
function cat(cp, cache) {
if (cp < 127) {
// Special-case optimization for ascii, except U+007F. This
// improves performance even for many primarily non-ascii texts,
// due to use of punctuation and white space characters from the
// ascii range.
if (cp >= 32) {
return 0 /* GC_Any */;
} else if (cp === 10) {
return 6 /* GC_LF */;
} else if (cp === 13) {
return 1 /* GC_CR */;
} else {
return 2 /* GC_Control */;
}
} else {
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (cp < cache[0] || cp > cache[1]) {
let index = findUnicodeRangeIndex(cp, grapheme_ranges);
if (index < 0) {
return 0;
}
let range = grapheme_ranges[index];
cache[0] = range[0];
cache[1] = range[1];
cache[2] = range[2];
}
// Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
if (cp <= BMP_MAX) {
return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]);
}

// Use cached result
if (cp >= cache[0] && cp <= cache[1]) {
return cache[2];
}

// Binary search, starting from bmpCursor
let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
if (index < 0) {
return 0;
}

const range = grapheme_ranges[index];
cache[0] = range[0];
cache[1] = range[1];
return (cache[2] = range[2]);
};

/**
Expand Down Expand Up @@ -291,46 +304,43 @@ function isBoundary(catBefore, catAfter, risCount, emoji, incb) {

// GB6 - L x (L | V | LV | LVT)
if (catBefore === 5) {
if (catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13) {
return false;
}
return !(catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13);
}

} else {
// GB7 - (LV | V) x (V | T)
if (
(catBefore === 7 || catBefore === 13) &&
(catAfter === 13 || catAfter === 12)
) {
return false;
}
// GB7 - (LV | V) x (V | T)
if (
(catBefore === 7 || catBefore === 13) &&
(catAfter === 13 || catAfter === 12)
) {
return false;
}

// GB8 - (LVT | T) x T
if (
(catBefore === 8 || catBefore === 12) &&
catAfter === 12
) {
return false;
}
// GB8 - (LVT | T) x T
if (
(catBefore === 8 || catBefore === 12) &&
catAfter === 12
) {
return false;
}

// GB9b
if (catBefore === 9) {
return false;
}
// GB9b
if (catBefore === 9) {
return false;
}

// GB9c
if (catAfter === 0 && incb) {
return false;
}
// GB9c
if (catAfter === 0 && incb) {
return false;
}

// GB11
if (catBefore === 14 && catAfter === 4) {
return !emoji;
}
// GB11
if (catBefore === 14 && catAfter === 4) {
return !emoji;
}

// GB12, GB13
if (catBefore === 10 && catAfter === 10) {
return risCount % 2 === 0;
}
// GB12, GB13
if (catBefore === 10 && catAfter === 10) {
return risCount % 2 === 0;
}

// GB999
Expand Down