Skip to content

Commit 9dc8e80

Browse files
committed
edit by GPT-OSS-120B
1 parent b351f2a commit 9dc8e80

File tree

1 file changed

+39
-17
lines changed

1 file changed

+39
-17
lines changed

src/grapheme.js

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,28 @@
1414
// @ts-check
1515

1616
import { findUnicodeRangeIndex } from './core.js';
17+
18+
// Precompute a fast lookup table for BMP code points (0..0xFFFF)
19+
// This table maps each code point to its Grapheme_Cluster_Break category.
20+
// It is generated once at module load time using the grapheme_ranges data.
21+
// The table is a Uint8Array of length 0x10000: one byte per code point, 64 KiB in total.
22+
// For code points >= 0x10000 we fall back to binary search as before.
23+
24+
/**
 * Lookup table indexed by BMP code point (U+0000..U+FFFF). Each slot holds
 * the category number of the `grapheme_ranges` entry covering that code
 * point; code points covered by no entry keep the 0 the array starts with.
 *
 * Built once, when this module is evaluated. `grapheme_ranges` is imported
 * further down the file; ES module imports are hoisted, so the binding is
 * expected to be available here (NOTE(review): confirm there is no circular
 * import between this module and `_grapheme_data.js`).
 */
const _bmpCategoryTable = (() => {
  const BMP_LAST = 0xFFFF;
  const table = new Uint8Array(BMP_LAST + 1);
  for (const [from, to, category] of grapheme_ranges) {
    // Clamp the range to the BMP; anything above U+FFFF is not stored here.
    const first = Math.max(0, from);
    const last = Math.min(BMP_LAST, to);
    // Nothing of this range lies inside the BMP. The guard also keeps
    // negative or inverted bounds away from fill(), which would interpret a
    // negative index as an offset from the end of the array.
    if (first > last) {
      continue;
    }
    // fill() takes an exclusive end index, hence the + 1.
    table.fill(category, first, last + 1);
  }
  return table;
})();
1739
import { GraphemeCategory, grapheme_ranges } from './_grapheme_data.js';
1840
import { consonant_ranges } from './_incb_data.js';
1941

@@ -203,12 +225,10 @@ export function* splitGraphemes(text) {
203225
* @param {import('./_grapheme_data.js').GraphemeCategoryRange} cache
204226
* @return {GraphemeCategoryNum}
205227
*/
228+
// Category lookup: ASCII fast path, then the BMP table, then a cached binary search.
206229
function cat(cp, cache) {
230+
// Fast path for ASCII (except U+007F): punctuation and white space from this range are common even in primarily non-ASCII text.
207231
if (cp < 127) {
208-
// Special-case optimization for ascii, except U+007F. This
209-
// improves performance even for many primarily non-ascii texts,
210-
// due to use of punctuation and white space characters from the
211-
// ascii range.
212232
if (cp >= 32) {
213233
return 0 /* GC_Any */;
214234
} else if (cp === 10) {
@@ -218,21 +238,23 @@ function cat(cp, cache) {
218238
} else {
219239
return 2 /* GC_Control */;
220240
}
221-
} else {
222-
// If this char isn't within the cached range, update the cache to the
223-
// range that includes it.
224-
if (cp < cache[0] || cp > cache[1]) {
225-
let index = findUnicodeRangeIndex(cp, grapheme_ranges);
226-
if (index < 0) {
227-
return 0;
228-
}
229-
let range = grapheme_ranges[index];
230-
cache[0] = range[0];
231-
cache[1] = range[1];
232-
cache[2] = range[2];
241+
}
242+
// Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
243+
if (cp <= 0xFFFF) {
244+
return /** @type {GraphemeCategoryNum} */(_bmpCategoryTable[cp]);
245+
}
246+
// Fallback for code points beyond BMP: use binary search with cache
247+
if (cp < cache[0] || cp > cache[1]) {
248+
let index = findUnicodeRangeIndex(cp, grapheme_ranges);
249+
if (index < 0) {
250+
return 0;
233251
}
234-
return cache[2];
252+
const range = grapheme_ranges[index];
253+
cache[0] = range[0];
254+
cache[1] = range[1];
255+
cache[2] = range[2];
235256
}
257+
return cache[2];
236258
};
237259

238260
/**

0 commit comments

Comments
 (0)