@@ -59,52 +59,86 @@ export function* graphemeSegments(input) {
5959 /** Category of codepoint immediately preceding cursor */
6060 let catBefore = cat ( cp ) ;
6161
62- /** @type {GraphemeCategoryNum | null } Category of codepoint immediately preceding cursor. */
63- let catAfter = null ;
62+ /** @type {GraphemeCategoryNum } Category of the codepoint at `cursor`, i.e. immediately following the candidate boundary. */
63+ let catAfter = 0 ;
6464
6565 /** The number of RIS codepoints preceding `cursor`. */
6666 let risCount = 0 ;
6767
68- /** Emoji state */
68+ /**
69+ * Emoji state for GB11: set when a ZWJ directly follows Extend or Extended_Pictographic within the current segment
70+ * Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
71+ */
6972 let emoji = false ;
7073
71- /** InCB=Consonant */
74+ /** InCB=Consonant - segment started with Indic consonant */
7275 let consonant = false ;
7376
74- /** InCB=Linker */
77+ /** InCB=Linker - seen a linker after consonant */
7578 let linker = false ;
7679
77- /** InCB=Consonant InCB=Linker x InCB=Consonant */
78- let incb = false ;
79-
8080 let index = 0 ;
8181
8282 /** Beginning category of a segment */
8383 let _catBegin = catBefore ;
8484
85- /** Memoize the beginnig code point of the segment. */
85+ /** Memoize the beginning code point of the segment. */
8686 let _hd = cp ;
8787
8888 while ( cursor < len ) {
8989 cp = /** @type {number } */ ( input . codePointAt ( cursor ) ) ;
9090 catAfter = cat ( cp ) ;
9191
92- if ( catBefore === 10 /* Regional_Indicator */ ) {
93- risCount ++ ;
94- } else {
95- risCount = 0 ;
96- if (
97- catAfter === 14 /* ZWJ */
98- && ( catBefore === 3 /* Extend */ || catBefore === 4 /* Extended_Pictographic */ )
99- ) {
100- emoji = true ;
92+ let boundary = true ;
10193
102- } else if ( catAfter === 0 ) {
103- incb = consonant && linker && isIndicConjunctConsonant ( cp ) ;
104- }
94+ // GB3: CR × LF
95+ if ( catBefore === 1 ) {
96+ boundary = catAfter !== 6 ;
97+ }
98+ // GB4: (Control | LF) ÷ - a preceding CR is already fully decided by the GB3 branch above
99+ else if ( catBefore === 2 || catBefore === 6 ) {
100+ boundary = true ;
101+ }
102+ // GB5: ÷ (Control | CR | LF)
103+ else if ( catAfter === 1 || catAfter === 2 || catAfter === 6 ) {
104+ boundary = true ;
105+ }
106+ // GB9, GB9a: × (Extend | ZWJ | SpacingMark) - most common no-break case
107+ else if ( catAfter === 3 || catAfter === 14 || catAfter === 11 ) {
108+ boundary = false ;
109+ }
110+ // GB9b: Prepend ×
111+ else if ( catBefore === 9 ) {
112+ boundary = false ;
113+ }
114+ // GB11: ExtPic Extend* ZWJ × ExtPic
115+ else if ( catBefore === 14 && catAfter === 4 ) {
116+ boundary = ! emoji ;
117+ }
118+ // GB12, GB13: RI × RI (no break only when an odd number of RIs precede the cursor in this segment)
119+ else if ( catBefore === 10 && catAfter === 10 ) {
120+ // risCount (read before the ++) is the number of RI × RI checks already made in this segment, so an odd value means an even number of RIs precede the cursor: break.
121+ boundary = risCount ++ % 2 === 1 ;
122+ }
123+ // GB6: L × (L | V | LV | LVT)
124+ else if ( catBefore === 5 ) {
125+ boundary = ! ( catAfter === 5 || catAfter === 13 || catAfter === 7 || catAfter === 8 ) ;
105126 }
127+ // GB7: (LV | V) × (V | T)
128+ else if ( ( catBefore === 7 || catBefore === 13 ) && ( catAfter === 13 || catAfter === 12 ) ) {
129+ boundary = false ;
130+ }
131+ // GB8: (LVT | T) × T
132+ else if ( ( catBefore === 8 || catBefore === 12 ) && catAfter === 12 ) {
133+ boundary = false ;
134+ }
135+ // GB9c: InCB=Consonant InCB=Extend* InCB=Linker InCB=Extend* × InCB=Consonant
136+ else if ( catAfter === 0 && consonant && linker && isIndicConjunctConsonant ( cp ) ) {
137+ boundary = false ;
138+ }
139+ // else GB999: ÷ Any
106140
107- if ( isBoundary ( catBefore , catAfter , risCount , emoji , incb ) ) {
141+ if ( boundary ) {
108142 yield {
109143 segment : input . slice ( index , cursor ) ,
110144 index,
@@ -114,23 +148,30 @@ export function* graphemeSegments(input) {
114148 _catEnd : catBefore ,
115149 } ;
116150
117- // flush
151+ // Reset segment state
118152 emoji = false ;
119- incb = false ;
153+ risCount = 0 ;
120154 index = cursor ;
121155 _catBegin = catAfter ;
122156 _hd = cp ;
123-
124- } else if ( cp >= 2325 ) {
125- // Note: Avoid InCB state checking much as possible
126- // Update InCB state only when continuing within a segment
127- if ( ! consonant && catBefore === 0 )
128- consonant = isIndicConjunctConsonant ( _hd ) ;
129-
130- if ( consonant && catAfter === 3 )
131- linker = isIndicConjunctLinker ( cp ) ;
132- else if ( catAfter === 0 )
133- linker = false ;
157+ }
158+ // Update state for continuing segment
159+ else {
160+ // emoji state for GB11
161+ if ( catAfter === 14 && ( catBefore === 3 || catBefore === 4 ) ) {
162+ emoji = true ;
163+ }
164+ // InCB state for GB9c
165+ else if ( cp >= 2325 ) {
166+ if ( ! consonant && catBefore === 0 ) {
167+ consonant = isIndicConjunctConsonant ( _hd ) ;
168+ }
169+ if ( consonant && catAfter === 3 ) {
170+ linker = linker || isIndicConjunctLinker ( cp ) ;
171+ } else {
172+ linker = false ;
173+ }
174+ }
134175 }
135176
136177 cursor += cp <= BMP_MAX ? 1 : 2 ;
@@ -313,79 +354,3 @@ function isIndicConjunctLinker(cp) {
313354 cp === 3405 /* 0x0D4D */
314355 ) ;
315356}
316-
317- /**
318- * @param {GraphemeCategoryNum } catBefore
319- * @param {GraphemeCategoryNum } catAfter
320- * @param {number } risCount Regional_Indicator state
321- * @param {boolean } emoji Extended_Pictographic state
322- * @param {boolean } incb Indic_Conjunct_Break state
323- * @return {boolean }
324- *
325- * @see https://www.unicode.org/reports/tr29/tr29-43.html#Grapheme_Cluster_Boundary_Rules
326- */
327- function isBoundary ( catBefore , catAfter , risCount , emoji , incb ) {
328- // GB3
329- if ( catBefore === 1 && catAfter === 6 ) {
330- return false ;
331- }
332-
333- // GB4
334- if ( catBefore === 1 || catBefore === 2 || catBefore === 6 ) {
335- return true ;
336- }
337-
338- // GB5
339- if ( catAfter === 1 || catAfter === 2 || catAfter === 6 ) {
340- return true ;
341- }
342-
343- // Most common cases - GB9, GB9a extend rules
344- if ( catAfter === 3 || catAfter === 14 || catAfter === 11 ) {
345- return false ;
346- }
347-
348- // GB6 - L x (L | V | LV | LVT)
349- if ( catBefore === 5 ) {
350- return ! ( catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13 ) ;
351- }
352-
353- // GB7 - (LV | V) x (V | T)
354- if (
355- ( catBefore === 7 || catBefore === 13 ) &&
356- ( catAfter === 13 || catAfter === 12 )
357- ) {
358- return false ;
359- }
360-
361- // GB8 - (LVT | T) x T
362- if (
363- ( catBefore === 8 || catBefore === 12 ) &&
364- catAfter === 12
365- ) {
366- return false ;
367- }
368-
369- // GB9b
370- if ( catBefore === 9 ) {
371- return false ;
372- }
373-
374- // GB9c
375- if ( catAfter === 0 && incb ) {
376- return false ;
377- }
378-
379- // GB11
380- if ( catBefore === 14 && catAfter === 4 ) {
381- return ! emoji ;
382- }
383-
384- // GB12, GB13
385- if ( catBefore === 10 && catAfter === 10 ) {
386- return risCount % 2 === 0 ;
387- }
388-
389- // GB999
390- return true ;
391- }
0 commit comments