Skip to content

Commit 882db6b

Browse files
committed
split segments into separate arrays
1 parent b655642 commit 882db6b

File tree

2 files changed

+15
-19
lines changed

2 files changed

+15
-19
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,823 | 12,532 | 5,309 | 3,942 | 4,999 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,775 | 12,519 | 5,297 | 3,933 | 4,985 |
219219
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 26,270 | 13,730 |
234+
| `unicode-segmenter/grapheme` | 26,260 | 13,720 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -198,32 +198,30 @@ const
198198
SEG0_MIN = 128,
199199
/** 0x2FFF */
200200
SEG0_MAX = 12287,
201-
/** 0x3000 - 0x80 */
202-
SEG1_OFF = 12160,
203201
/** 0xA000 */
204202
SEG1_MIN = 40960,
205203
/** 0xDFFF */
206204
SEG1_MAX = 57343,
207-
/** SEG1_OFF + (0xE000 - 0xA000) */
208-
SEG2_OFF = 28544,
209205
/** 0xFE00 */
210206
SEG2_MIN = 65024;
211207

212208
/**
213-
* Segmented lookup table for BMP code points.
209+
* Segmented lookup tables for BMP code points.
214210
*
215211
* Memory optimization: Skip regions that are almost 100% category 0 (Any):
216212
* - 0x3000-0x9FFF (CJK): 28,672 codepoints, only 12 non-Any -> inlined fast path
217213
* - 0xE000-0xFDFF (Private Use plus CJK compatibility / presentation forms): 7,680 codepoints, only 1 non-Any (0xFB1E) -> inlined fast path
218214
*
219215
* Cache segments:
220-
* - Segment 0: 0x0080-0x2FFF (12,160 bytes)
221-
* - Segment 1: 0xA000-0xDFFF (16,384 bytes)
222-
* - Segment 2: 0xFE00-0xFFFF (512 bytes)
216+
* - seg0: 0x0080-0x2FFF (12,160 bytes)
217+
* - seg1: 0xA000-0xDFFF (16,384 bytes)
218+
* - seg2: 0xFE00-0xFFFF (512 bytes)
223219
*
224220
* Total: 29,056 bytes (~28KB)
225221
*/
226-
let bmpLookup = new Uint8Array(29056);
222+
let seg0 = new Uint8Array(SEG0_MAX - SEG0_MIN + 1);
223+
let seg1 = new Uint8Array(SEG1_MAX - SEG1_MIN + 1);
224+
let seg2 = new Uint8Array(BMP_MAX - SEG2_MIN + 1);
227225
let bmpCursor = (() => {
228226
let cursor = 0;
229227
while (cursor < grapheme_ranges.length) {
@@ -235,11 +233,9 @@ let bmpCursor = (() => {
235233
if (end < SEG0_MIN || (start > SEG0_MAX && end < SEG1_MIN) || (start > SEG1_MAX && end < SEG2_MIN)) continue;
236234

237235
for (let cp = start; cp <= end && cp <= BMP_MAX; cp++) {
238-
let idx = -1;
239-
if (cp >= SEG0_MIN && cp <= SEG0_MAX) idx = cp - SEG0_MIN;
240-
if (cp >= SEG1_MIN && cp <= SEG1_MAX) idx = SEG1_OFF + (cp - SEG1_MIN);
241-
if (cp >= SEG2_MIN && cp <= BMP_MAX) idx = SEG2_OFF + (cp - SEG2_MIN);
242-
if (idx >= 0) bmpLookup[idx] = cat;
236+
if (cp >= SEG0_MIN && cp <= SEG0_MAX) seg0[cp - SEG0_MIN] = cat;
237+
else if (cp >= SEG1_MIN && cp <= SEG1_MAX) seg1[cp - SEG1_MIN] = cat;
238+
else if (cp >= SEG2_MIN) seg2[cp - SEG2_MIN] = cat;
243239
}
244240
}
245241
return cursor;
@@ -272,7 +268,7 @@ function cat(cp) {
272268
}
273269
// Segment 0
274270
if (cp <= SEG0_MAX) {
275-
return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp - SEG0_MIN]);
271+
return /** @type {GraphemeCategoryNum} */ (seg0[cp - SEG0_MIN]);
276272
}
277273
// CJK fast path
278274
if (cp < SEG1_MIN) {
@@ -286,15 +282,15 @@ function cat(cp) {
286282
}
287283
// Segment 1
288284
if (cp <= SEG1_MAX) {
289-
return /** @type {GraphemeCategoryNum} */ (bmpLookup[SEG1_OFF + (cp - SEG1_MIN)]);
285+
return /** @type {GraphemeCategoryNum} */ (seg1[cp - SEG1_MIN]);
290286
}
291287
// Private Use fast path
292288
if (cp < SEG2_MIN) {
293289
return cp === 0xFB1E ? 3 : 0;
294290
}
295291
// Segment 2
296292
if (cp <= BMP_MAX) {
297-
return /** @type {GraphemeCategoryNum} */ (bmpLookup[SEG2_OFF + (cp - SEG2_MIN)]);
293+
return /** @type {GraphemeCategoryNum} */ (seg2[cp - SEG2_MIN]);
298294
}
299295
// Non-BMP
300296
let idx = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);

0 commit comments

Comments
 (0)