@@ -193,21 +193,54 @@ export function* splitGraphemes(text) {
193193 for ( let s of graphemeSegments ( text ) ) yield s . segment ;
194194}
195195
// Inclusive code point bounds of the two cached BMP segments.

/** First code point stored in SEG0 (everything below is handled as ASCII). */
const SEG0_MIN = 0x80;

/** Last code point stored in SEG0. */
const SEG0_MAX = 0x2fff;

/** First code point stored in SEG1. */
const SEG1_MIN = 0xa000;

/** Last code point stored in SEG1. */
const SEG1_MAX = 0xdfff;
205+
196206/**
197- * Precompute a fast lookup table for BMP code points (0..0xFFFF)
198- * This table maps each code point to its Grapheme_Cluster_Break category.
199- * It is generated once at module load time using the grapheme_ranges data.
200- * The table is a Uint8Array of length 0x10000 (64KB), which is acceptable in memory.
201- * For code points >= 0x10000 we fall back to binary search.
207+ * Segmented 4-bit packed lookup tables for BMP code points.
208+ *
209+ * Memory optimization: Skip regions that are almost 100% category {@link GC_Any}:
210+ * - 0x3000-0x9FFF (CJK): 28,672 codepoints, only 12 non-Any ranges -> need to be inlined
211+ * - 0xE000-0xFDFF (Private Use Area through Arabic Presentation Forms-A): 7,680 codepoints, only 1 non-Any range -> very rare, and simple enough to inline
212+ * - 0xFE00-0xFFFF (Specials): 512 codepoints, only 5 ranges -> very rare, fall back to binary search
213+ *
214+ * Cache segments (4-bit packed, 2 categories per byte):
215+ * - SEG0: 0x0080-0x2FFF (12,160 codepoints -> 6,080 bytes)
216+ * - SEG1: 0xA000-0xDFFF (16,384 codepoints -> 8,192 bytes)
217+ *
218+ * Total: 14,272 bytes (~14KB)
202219 */
203- let bmpLookup = new Uint8Array ( BMP_MAX + 1 ) ;
204- let bmpCursor = ( ( ) => {
// Each byte holds two 4-bit categories, so a table needs half as many bytes
// as the number of code points in its (inclusive) range.
const SEG0 = new Uint8Array((1 + SEG0_MAX - SEG0_MIN) >>> 1);
const SEG1 = new Uint8Array((1 + SEG1_MAX - SEG1_MIN) >>> 1);
222+ const SEG_CURSOR = ( ( ) => {
205223 let cursor = 0 ;
206- let cp = 0 ;
207- while ( cp <= BMP_MAX ) {
208- let range = grapheme_ranges [ cursor ++ ] ;
209- for ( cp = range [ 0 ] ; cp <= range [ 1 ] ; cp ++ ) {
210- bmpLookup [ cp ] = range [ 2 ] ;
224+ while ( cursor < grapheme_ranges . length ) {
225+ let [ start , end , cat ] = grapheme_ranges [ cursor ] ;
226+ if ( start > SEG1_MAX ) break ;
227+ cursor ++ ;
228+
229+ // Skip ranges outside segments (ASCII/CJK/PrivateUse fast paths)
230+ if ( end < SEG0_MIN || ( start > SEG0_MAX && end < SEG1_MIN ) ) continue ;
231+
232+ for ( let cp = start ; cp <= end ; cp ++ ) {
233+ let /** @type {Uint8Array } */ seg , idx = 0 ;
234+
235+ if ( cp >= SEG0_MIN && cp <= SEG0_MAX ) {
236+ seg = SEG0 ; idx = ( cp - SEG0_MIN ) >> 1 ;
237+ } else if ( cp >= SEG1_MIN ) {
238+ seg = SEG1 ; idx = ( cp - SEG1_MIN ) >> 1 ;
239+ } else continue ;
240+
241+ seg [ idx ] = cp & 1
242+ ? ( seg [ idx ] & 0x0F ) | ( cat << 4 )
243+ : ( seg [ idx ] & 0xF0 ) | cat ;
211244 }
212245 }
213246 return cursor ;
@@ -222,15 +255,41 @@ let bmpCursor = (() => {
222255 * @return {GraphemeCategoryNum }
223256 */
function cat(cp) {
  // Lookup order: ASCII -> SEG0 table -> inlined CJK gap -> SEG1 table ->
  // inlined 0xE000..0xFDFF gap -> binary search over the remaining ranges.

  // ASCII fast path (everything below the first cached segment).
  if (cp < SEG0_MIN) {
    // U+007F (DEL) is a control character: the Unicode data lists 007F..009F
    // as Control, but the packed tables only start at 0x80, so it has to be
    // special-cased here instead of being reported as category 0.
    if (cp >= 32) return cp === 127 ? 2 : 0;
    if (cp === 10) return 6; // LF
    if (cp === 13) return 1; // CR
    return 2; // remaining C0 controls
  }

  // Segment 0: two 4-bit categories per byte; odd code points live in the
  // high nibble, even ones in the low nibble.
  if (cp <= SEG0_MAX) {
    const packed = SEG0[(cp - SEG0_MIN) >> 1];
    return /** @type {GraphemeCategoryNum} */ (cp & 1 ? packed >> 4 : packed & 0x0F);
  }

  // Gap between the segments (0x3000..0x9FFF): almost entirely category 0,
  // so the few exceptions are spelled out inline.
  if (cp < SEG1_MIN) {
    if (cp < 0x3030) return cp >= 0x302A ? 3 : 0;
    if (cp < 0x309B) {
      if (cp === 0x3030 || cp === 0x303D) return 4;
      return cp >= 0x3099 ? 3 : 0;
    }
    if (cp === 0x3297 || cp === 0x3299) return 4;
    return 0;
  }

  // Segment 1: same nibble layout as segment 0.
  if (cp <= SEG1_MAX) {
    const packed = SEG1[(cp - SEG1_MIN) >> 1];
    return /** @type {GraphemeCategoryNum} */ (cp & 1 ? packed >> 4 : packed & 0x0F);
  }

  // 0xE000..0xFDFF: a single non-zero code point.
  if (cp < 0xFE00) {
    return cp === 0xFB1E ? 3 : 0;
  }

  // 0xFE00..0xFFFF and everything beyond the BMP: binary search, starting at
  // the first range the table builder did not consume.
  const idx = findUnicodeRangeIndex(cp, grapheme_ranges, SEG_CURSOR);
  return idx < 0 ? 0 : grapheme_ranges[idx][2];
}
234293
235294/**
236295 * @param {number } cp
0 commit comments