@@ -59,52 +59,94 @@ export function* graphemeSegments(input) {
5959 /** Category of codepoint immediately preceding cursor */
6060 let catBefore = cat ( cp ) ;
6161
62- /** @type {GraphemeCategoryNum | null } Category of codepoint immediately preceding cursor. */
63- let catAfter = null ;
62+ /** @type {GraphemeCategoryNum } Category of codepoint immediately following cursor. */
63+ let catAfter = 0 ;
6464
6565 /** Regional_Indicator codepoints counted so far in the current segment; the one at catBefore is counted only after the boundary check. */
6666 let risCount = 0 ;
6767
68- /** Emoji state */
68+ /**
69+ * Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
70+ * Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
71+ */
6972 let emoji = false ;
7073
71- /** InCB=Consonant */
74+ /** InCB=Consonant - segment started with Indic consonant */
7275 let consonant = false ;
7376
74- /** InCB=Linker */
77+ /** InCB=Linker - seen a linker after consonant */
7578 let linker = false ;
7679
77- /** InCB=Consonant InCB=Linker x InCB=Consonant */
78- let incb = false ;
79-
8080 let index = 0 ;
8181
8282 /** Beginning category of a segment */
8383 let _catBegin = catBefore ;
8484
85- /** Memoize the beginnig code point of the segment. */
85+ /** Memoize the beginning code point of the segment. */
8686 let _hd = cp ;
8787
8888 while ( cursor < len ) {
8989 cp = /** @type {number } */ ( input . codePointAt ( cursor ) ) ;
9090 catAfter = cat ( cp ) ;
9191
92- if ( catBefore === 10 /* Regional_Indicator */ ) {
93- risCount ++ ;
94- } else {
95- risCount = 0 ;
96- if (
97- catAfter === 14 /* ZWJ */
98- && ( catBefore === 3 /* Extend */ || catBefore === 4 /* Extended_Pictographic */ )
99- ) {
100- emoji = true ;
92+ let boundary = true ;
10193
102- } else if ( catAfter === 0 ) {
103- incb = consonant && linker && isIndicConjunctConsonant ( cp ) ;
94+ // GB3: CR × LF
95+ if ( catBefore === 1 ) {
96+ boundary = catAfter !== 6 ;
97+ // GB4 is implicit: CR breaks unless followed by LF
98+ }
99+ // GB4: (Control | CR | LF) ÷
100+ else if ( catBefore === 2 || catBefore === 6 ) {
101+ boundary = true ;
102+ }
103+ // GB5: ÷ (Control | CR | LF)
104+ else if ( catAfter === 1 || catAfter === 2 || catAfter === 6 ) {
105+ boundary = true ;
106+ }
107+ // GB9, GB9a: × (Extend | ZWJ | SpacingMark) - most common no-break case
108+ else if ( catAfter === 3 || catAfter === 14 || catAfter === 11 ) {
109+ boundary = false ;
110+ // Update emoji state for GB11: track Extend/ExtPic followed by ZWJ
111+ if ( catAfter === 14 && ( catBefore === 3 || catBefore === 4 ) ) {
112+ emoji = true ;
104113 }
105114 }
115+ // GB9b: Prepend ×
116+ else if ( catBefore === 9 ) {
117+ boundary = false ;
118+ }
119+ // GB11: ExtPic Extend* ZWJ × ExtPic
120+ else if ( catBefore === 14 && catAfter === 4 ) {
121+ boundary = ! emoji ;
122+ // emoji state consumed, will be reset on boundary
123+ }
124+ // GB12, GB13: RI × RI (no break only when risCount is even, i.e. catBefore is the 1st, 3rd, ... RI of its run)
125+ else if ( catBefore === 10 && catAfter === 10 ) {
126+ // risCount is count BEFORE current RI, so odd means this is 2nd, 4th, etc.
127+ boundary = risCount % 2 === 1 ;
128+ }
129+ // GB6: L × (L | V | LV | LVT)
130+ else if ( catBefore === 5 ) {
131+ boundary = ! ( catAfter === 5 || catAfter === 13 || catAfter === 7 || catAfter === 8 ) ;
132+ }
133+ // GB7: (LV | V) × (V | T)
134+ else if ( ( catBefore === 7 || catBefore === 13 ) && ( catAfter === 13 || catAfter === 12 ) ) {
135+ boundary = false ;
136+ }
137+ // GB8: (LVT | T) × T
138+ else if ( ( catBefore === 8 || catBefore === 12 ) && catAfter === 12 ) {
139+ boundary = false ;
140+ }
141+ // GB9c: InCB=Consonant InCB=Extend* InCB=Linker InCB=Extend* × InCB=Consonant
142+ else if ( catAfter === 0 && consonant && linker && isIndicConjunctConsonant ( cp ) ) {
143+ boundary = false ;
144+ linker = false ;
145+ consonant = false ;
146+ }
147+ // else GB999: ÷ Any
106148
107- if ( isBoundary ( catBefore , catAfter , risCount , emoji , incb ) ) {
149+ if ( boundary ) {
108150 yield {
109151 segment : input . slice ( index , cursor ) ,
110152 index,
@@ -114,23 +156,29 @@ export function* graphemeSegments(input) {
114156 _catEnd : catBefore ,
115157 } ;
116158
117- // flush
159+ // Reset segment state
118160 emoji = false ;
119- incb = false ;
161+ risCount = 0 ;
120162 index = cursor ;
121163 _catBegin = catAfter ;
122164 _hd = cp ;
123-
124- } else if ( cp >= 2325 ) {
125- // Note: Avoid InCB state checking much as possible
126- // Update InCB state only when continuing within a segment
127- if ( ! consonant && catBefore === 0 )
128- consonant = isIndicConjunctConsonant ( _hd ) ;
129-
130- if ( consonant && catAfter === 3 )
131- linker = isIndicConjunctLinker ( cp ) ;
132- else if ( catAfter === 0 )
133- linker = false ;
165+ } else {
166+ // Update state for continuing segment
167+
168+ // RI counting for GB12/13
169+ if ( catBefore === 10 ) {
170+ risCount ++ ;
171+ }
172+
173+ // InCB state for GB9c (only for Indic scripts, cp >= 2325)
174+ if ( cp >= 2325 ) {
175+ if ( ! consonant && catBefore === 0 ) {
176+ consonant = isIndicConjunctConsonant ( _hd ) ;
177+ }
178+ if ( consonant && catAfter === 3 ) {
179+ linker = linker || isIndicConjunctLinker ( cp ) ;
180+ }
181+ }
134182 }
135183
136184 cursor += cp <= BMP_MAX ? 1 : 2 ;
@@ -313,79 +361,3 @@ function isIndicConjunctLinker(cp) {
313361 cp === 3405 /* 0x0D4D */
314362 ) ;
315363}
316-
317- /**
318- * @param {GraphemeCategoryNum } catBefore
319- * @param {GraphemeCategoryNum } catAfter
320- * @param {number } risCount Regional_Indicator state
321- * @param {boolean } emoji Extended_Pictographic state
322- * @param {boolean } incb Indic_Conjunct_Break state
323- * @return {boolean }
324- *
325- * @see https://www.unicode.org/reports/tr29/tr29-43.html#Grapheme_Cluster_Boundary_Rules
326- */
327- function isBoundary ( catBefore , catAfter , risCount , emoji , incb ) {
328- // GB3
329- if ( catBefore === 1 && catAfter === 6 ) {
330- return false ;
331- }
332-
333- // GB4
334- if ( catBefore === 1 || catBefore === 2 || catBefore === 6 ) {
335- return true ;
336- }
337-
338- // GB5
339- if ( catAfter === 1 || catAfter === 2 || catAfter === 6 ) {
340- return true ;
341- }
342-
343- // Most common cases - GB9, GB9a extend rules
344- if ( catAfter === 3 || catAfter === 14 || catAfter === 11 ) {
345- return false ;
346- }
347-
348- // GB6 - L x (L | V | LV | LVT)
349- if ( catBefore === 5 ) {
350- return ! ( catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13 ) ;
351- }
352-
353- // GB7 - (LV | V) x (V | T)
354- if (
355- ( catBefore === 7 || catBefore === 13 ) &&
356- ( catAfter === 13 || catAfter === 12 )
357- ) {
358- return false ;
359- }
360-
361- // GB8 - (LVT | T) x T
362- if (
363- ( catBefore === 8 || catBefore === 12 ) &&
364- catAfter === 12
365- ) {
366- return false ;
367- }
368-
369- // GB9b
370- if ( catBefore === 9 ) {
371- return false ;
372- }
373-
374- // GB9c
375- if ( catAfter === 0 && incb ) {
376- return false ;
377- }
378-
379- // GB11
380- if ( catBefore === 14 && catAfter === 4 ) {
381- return ! emoji ;
382- }
383-
384- // GB12, GB13
385- if ( catBefore === 10 && catAfter === 10 ) {
386- return risCount % 2 === 0 ;
387- }
388-
389- // GB999
390- return true ;
391- }
0 commit comments