1414// @ts-check
1515
1616import { findUnicodeRangeIndex } from './core.js' ;
17+
/**
 * Grapheme_Cluster_Break category for every BMP code point (0..0xFFFF),
 * indexed directly by code point.
 *
 * Filled once, when this module is evaluated, from `grapheme_ranges`: each
 * `[from, to, category]` entry is clamped to the BMP and written into the
 * table. Code points that no entry covers keep the zero-initialised value,
 * 0 (GC_Any). The table occupies 0x10000 bytes (64 KiB). It holds nothing
 * for code points above 0xFFFF; those have to be resolved some other way.
 */
const _bmpCategoryTable = (() => {
  const BMP_MAX = 0xFFFF;
  const lookup = new Uint8Array(BMP_MAX + 1);

  for (const entry of grapheme_ranges) {
    // Clamp the entry to the BMP; an entry lying wholly above it ends up
    // with first > last and is skipped.
    const first = Math.max(0, entry[0]);
    const last = Math.min(BMP_MAX, entry[1]);
    if (first <= last) {
      lookup.fill(entry[2], first, last + 1); // `fill` excludes its end index
    }
  }

  return lookup;
})();
1739import { GraphemeCategory , grapheme_ranges } from './_grapheme_data.js' ;
1840import { consonant_ranges } from './_incb_data.js' ;
1941
@@ -203,12 +225,10 @@ export function* splitGraphemes(text) {
203225 * @param {import('./_grapheme_data.js').GraphemeCategoryRange } cache
204226 * @return {GraphemeCategoryNum }
205227 */
228+ // Resolves a code point's category: ASCII fast path, then the BMP table, then a cached range search.
206229function cat ( cp , cache ) {
230+ // Fast path for ASCII below U+007F; U+007F itself falls through to the table lookup below.
207231 if ( cp < 127 ) {
208- // Special-case optimization for ascii, except U+007F. This
209- // improves performance even for many primarily non-ascii texts,
210- // due to use of punctuation and white space characters from the
211- // ascii range.
212232 if ( cp >= 32 ) {
213233 return 0 /* GC_Any */ ;
214234 } else if ( cp === 10 ) {
@@ -218,21 +238,23 @@ function cat(cp, cache) {
218238 } else {
219239 return 2 /* GC_Control */ ;
220240 }
221- } else {
222- // If this char isn't within the cached range, update the cache to the
223- // range that includes it.
224- if ( cp < cache [ 0 ] || cp > cache [ 1 ] ) {
225- let index = findUnicodeRangeIndex ( cp , grapheme_ranges ) ;
226- if ( index < 0 ) {
227- return 0 ;
228- }
229- let range = grapheme_ranges [ index ] ;
230- cache [ 0 ] = range [ 0 ] ;
231- cache [ 1 ] = range [ 1 ] ;
232- cache [ 2 ] = range [ 2 ] ;
241+ }
242+ // Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
243+ if ( cp <= 0xFFFF ) {
244+ return /** @type {GraphemeCategoryNum } */ ( _bmpCategoryTable [ cp ] ) ;
245+ }
246+ // Fallback for code points beyond BMP: use binary search with cache
247+ if ( cp < cache [ 0 ] || cp > cache [ 1 ] ) {
248+ let index = findUnicodeRangeIndex ( cp , grapheme_ranges ) ;
249+ if ( index < 0 ) {
250+ return 0 ;
233251 }
234- return cache [ 2 ] ;
252+ const range = grapheme_ranges [ index ] ;
253+ cache [ 0 ] = range [ 0 ] ;
254+ cache [ 1 ] = range [ 1 ] ;
255+ cache [ 2 ] = range [ 2 ] ;
235256 }
257+ return cache [ 2 ] ;
236258} ;
237259
238260/**
0 commit comments