Skip to content

Commit ac96013

Browse files
authored
get rid of inefficient range caching (#98)
* get rid of inefficient range caching * update bundle stats * changeset
1 parent a088d10 commit ac96013

File tree

3 files changed

+16
-22
lines changed

3 files changed

+16
-22
lines changed

.changeset/fifty-tips-sit.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
"unicode-segmenter": patch
3+
---
4+
5+
Removed an inefficient optimization from the grapheme segmenter.
6+
7+
The single-range cache is rarely hit now that every BMP code point is served by the BMP lookup table.
8+
So it was removed to reduce both the code size and the number of comparisons per lookup.
9+
10+
The BMP lookup table is worth the 64KB of linear memory it occupies. That cost should definitely be acceptable, as it is still less heap memory than executing graphemer's uncompressed code requires.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
219219

220220
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
221221
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
222-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,730 | 12,199 | 5,113 | 3,787 | 4,807 |
222+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,449 | 12,108 | 5,060 | 3,738 | 4,744 |
223223
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
224224
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
225225
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,301 | 369,576 | 72,225 | 49,483 | 67,964 |
@@ -235,7 +235,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
235235

236236
| Name | Bytecode size | Bytecode size (gzip)* |
237237
|------------------------------|--------------:|----------------------:|
238-
| `unicode-segmenter/grapheme` | 21,542 | 11,392 |
238+
| `unicode-segmenter/grapheme` | 21,372 | 11,328 |
239239
| `graphemer` | 134,089 | 31,766 |
240240
| `grapheme-splitter` | 63,946 | 19,162 |
241241

src/grapheme.js

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,8 @@ export function* graphemeSegments(input) {
5656
/** Total length of the input string. */
5757
let len = input.length;
5858

59-
/** @type {import('./_grapheme_data.js').GraphemeCategoryRange} */
60-
let cache = [0, 0, 2 /* GC_Control */];
61-
6259
/** Category of codepoint immediately preceding cursor */
63-
let catBefore = cat(cp, cache);
60+
let catBefore = cat(cp);
6461

6562
/** @type {GraphemeCategoryNum | null} Category of codepoint immediately following cursor. */
6663
let catAfter = null;
@@ -101,7 +98,7 @@ export function* graphemeSegments(input) {
10198
}
10299

103100
cp = /** @type {number} */ (input.codePointAt(cursor));
104-
catAfter = cat(cp, cache);
101+
catAfter = cat(cp);
105102

106103
if (catBefore === 10 /* Regional_Indicator */) {
107104
risCount++;
@@ -222,30 +219,17 @@ let bmpCursor = (() => {
222219
* @see https://www.unicode.org/reports/tr29/tr29-43.html#Default_Grapheme_Cluster_Table
223220
*
224221
* @param {number} cp
225-
* @param {import('./_grapheme_data.js').GraphemeCategoryRange} cache
226222
* @return {GraphemeCategoryNum}
227223
*/
228-
function cat(cp, cache) {
224+
function cat(cp) {
229225
// Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
230226
if (cp <= BMP_MAX) {
231227
return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]);
232228
}
233229

234-
// Use cached result
235-
if (cp >= cache[0] && cp <= cache[1]) {
236-
return cache[2];
237-
}
238-
239230
// Binary search, starting from bmpCursor
240231
let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
241-
if (index < 0) {
242-
return 0;
243-
}
244-
245-
const range = grapheme_ranges[index];
246-
cache[0] = range[0];
247-
cache[1] = range[1];
248-
return (cache[2] = range[2]);
232+
return index < 0 ? 0 : grapheme_ranges[index][2];
249233
};
250234

251235
/**

0 commit comments

Comments
 (0)