Skip to content

Commit 558bbe3

Browse files
committed
Optimizing category lookup, but without packing
1 parent c7ba0ff commit 558bbe3

File tree

2 files changed

+25
-37
lines changed

2 files changed

+25
-37
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 17,150 | 12,602 | 5,344 | 3,982 | 5,053 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,823 | 12,532 | 5,309 | 3,942 | 4,999 |
219219
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 26,551 | 13,901 |
234+
| `unicode-segmenter/grapheme` | 26,270 | 13,730 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -198,32 +198,32 @@ const
198198
SEG0_MIN = 128,
199199
/** 0x2FFF */
200200
SEG0_MAX = 12287,
201-
/** (0x3000 - 0x80) >> 1 */
202-
SEG1_OFF = 6080,
201+
/** 0x3000 - 0x80 */
202+
SEG1_OFF = 12160,
203203
/** 0xA000 */
204204
SEG1_MIN = 40960,
205205
/** 0xDFFF */
206206
SEG1_MAX = 57343,
207-
/** SEG1_OFF + ((0xE000 - 0xA000) >> 1) */
208-
SEG2_OFF = 14272,
207+
/** SEG1_OFF + (0xE000 - 0xA000) */
208+
SEG2_OFF = 28544,
209209
/** 0xFE00 */
210210
SEG2_MIN = 65024;
211211

212212
/**
213-
* Segmented 4-bit packed lookup table for BMP code points.
213+
* Segmented lookup table for BMP code points.
214214
*
215215
* Memory optimization: Skip regions that are almost 100% category 0 (Any):
216216
* - 0x3000-0x9FFF (CJK): 28,672 codepoints, only 12 non-Any -> inlined fast path
217217
* - 0xE000-0xFDFF (Private Use, CJK Compatibility Ideographs, Presentation Forms): 7,680 codepoints, only 1 non-Any (U+FB1E) -> inlined fast path
218218
*
219219
* Cache segments:
220-
* - Segment 0: 0x0080-0x2FFF (12,160 codepoints -> 6,080 bytes)
221-
* - Segment 1: 0xA000-0xDFFF (16,384 codepoints -> 8,192 bytes)
222-
* - Segment 2: 0xFE00-0xFFFF (512 codepoints -> 256 bytes)
220+
* - Segment 0: 0x0080-0x2FFF (12,160 bytes)
221+
* - Segment 1: 0xA000-0xDFFF (16,384 bytes)
222+
* - Segment 2: 0xFE00-0xFFFF (512 bytes)
223223
*
224-
* Total: 14,528 bytes (~14KB)
224+
* Total: 29,056 bytes (~28KB)
225225
*/
226-
let bmpLookup = new Uint8Array(14528);
226+
let bmpLookup = new Uint8Array(29056);
227227
let bmpCursor = (() => {
228228
let cursor = 0;
229229
while (cursor < grapheme_ranges.length) {
@@ -236,14 +236,10 @@ let bmpCursor = (() => {
236236

237237
for (let cp = start; cp <= end && cp <= BMP_MAX; cp++) {
238238
let idx = -1;
239-
if (cp >= SEG0_MIN && cp <= SEG0_MAX) idx = (cp - SEG0_MIN) >> 1;
240-
if (cp >= SEG1_MIN && cp <= SEG1_MAX) idx = SEG1_OFF + ((cp - SEG1_MIN) >> 1);
241-
if (cp >= SEG2_MIN && cp <= BMP_MAX) idx = SEG2_OFF + ((cp - SEG2_MIN) >> 1);
242-
if (idx >= 0) {
243-
bmpLookup[idx] = cp & 1
244-
? (bmpLookup[idx] & 0x0F) | (cat << 4)
245-
: (bmpLookup[idx] & 0xF0) | cat;
246-
}
239+
if (cp >= SEG0_MIN && cp <= SEG0_MAX) idx = cp - SEG0_MIN;
240+
if (cp >= SEG1_MIN && cp <= SEG1_MAX) idx = SEG1_OFF + (cp - SEG1_MIN);
241+
if (cp >= SEG2_MIN && cp <= BMP_MAX) idx = SEG2_OFF + (cp - SEG2_MIN);
242+
if (idx >= 0) bmpLookup[idx] = cat;
247243
}
248244
}
249245
return cursor;
@@ -258,13 +254,13 @@ let bmpCursor = (() => {
258254
* @return {GraphemeCategoryNum}
259255
*/
260256
function cat(cp) {
261-
// Ordered pass by range:
257+
// Ordered pass by range:
262258
// 1. ASCII fast path
263259
// 2. Segment 0 cache
264-
// 3. CJK fast path
265-
// 4. Segment 1 cache
266-
// 5. PrivateUse fast path
267-
// 6. Segment 2 cache
260+
// 3. CJK fast path
261+
// 4. Segment 1 cache
262+
// 5. PrivateUse fast path
263+
// 6. Segment 2 cache
268264
// 7. Non-BMP binary search
269265

270266
// ASCII fast path
@@ -274,13 +270,9 @@ function cat(cp) {
274270
if (cp === 13) return 1;
275271
return 2;
276272
}
277-
278-
let byte = 0, idx = -1;
279273
// Segment 0
280274
if (cp <= SEG0_MAX) {
281-
idx = (cp - SEG0_MIN) >> 1;
282-
byte = bmpLookup[idx];
283-
return /** @type {GraphemeCategoryNum} */ (cp & 1 ? byte >> 4 : byte & 0x0F);
275+
return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp - SEG0_MIN]);
284276
}
285277
// CJK fast path
286278
if (cp < SEG1_MIN) {
@@ -294,22 +286,18 @@ function cat(cp) {
294286
}
295287
// Segment 1
296288
if (cp <= SEG1_MAX) {
297-
idx = SEG1_OFF + ((cp - SEG1_MIN) >> 1);
298-
byte = bmpLookup[idx];
299-
return /** @type {GraphemeCategoryNum} */ (cp & 1 ? byte >> 4 : byte & 0x0F);
289+
return /** @type {GraphemeCategoryNum} */ (bmpLookup[SEG1_OFF + (cp - SEG1_MIN)]);
300290
}
301291
// Private Use fast path
302292
if (cp < SEG2_MIN) {
303293
return cp === 0xFB1E ? 3 : 0;
304294
}
305295
// Segment 2
306296
if (cp <= BMP_MAX) {
307-
idx = SEG2_OFF + ((cp - SEG2_MIN) >> 1);
308-
byte = bmpLookup[idx];
309-
return /** @type {GraphemeCategoryNum} */ (cp & 1 ? byte >> 4 : byte & 0x0F);
297+
return /** @type {GraphemeCategoryNum} */ (bmpLookup[SEG2_OFF + (cp - SEG2_MIN)]);
310298
}
311299
// Non-BMP
312-
idx = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
300+
let idx = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
313301
return idx < 0 ? 0 : grapheme_ranges[idx][2];
314302
}
315303

0 commit comments

Comments
 (0)