@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
101101}
102102
103103UTF8PROC_DLLEXPORT const char * utf8proc_unicode_version (void ) {
104- return "15.0.0" ;
104+ return "15.1.0" ;
105105}
106106
107107UTF8PROC_DLLEXPORT const char * utf8proc_errmsg (utf8proc_ssize_t errcode ) {
@@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
288288 true; // GB999
289289}
290290
291- static utf8proc_bool grapheme_break_extended (int lbc , int tbc , utf8proc_int32_t * state )
291+ static utf8proc_bool grapheme_break_extended (int lbc , int tbc , int licb , int ticb , utf8proc_int32_t * state )
292292{
293293 if (state ) {
294- int lbc_override ;
295- if (* state == UTF8PROC_BOUNDCLASS_START )
296- * state = lbc_override = lbc ;
297- else
298- lbc_override = * state ;
299- utf8proc_bool break_permitted = grapheme_break_simple (lbc_override , tbc );
294+ int state_bc , state_icb ; /* boundclass and indic_conjunct_break state */
295+ if (* state == 0 ) { /* state initialization */
296+ state_bc = lbc ;
297+ state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE ;
298+ }
299+ else { /* lbc and licb are already encoded in *state */
300+ state_bc = * state & 0xff ; // 1st byte of state is bound class
301+ state_icb = * state >> 8 ; // 2nd byte of state is indic conjunct break
302+ }
303+
304+ utf8proc_bool break_permitted = grapheme_break_simple (state_bc , tbc ) &&
305+ !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
306+ && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ); // GB9c
307+
308+ // Special support for GB9c. Don't break between two consonants
309+ // separated by 1+ linker characters and 0+ extend characters in any order.
310+ // After a consonant, we enter LINKER state after at least one linker.
311+ if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
312+ || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
313+ || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND )
314+ state_icb = ticb ;
315+ else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER )
316+ state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
317+ UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb ;
300318
301319 // Special support for GB 12/13 made possible by GB999. After two RI
302320 // class codepoints we want to force a break. Do this by resetting the
303321 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
304322 // after that character according to GB999 (unless of course such a break is
305323 // forbidden by a different rule such as GB9).
306- if (* state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR )
307- * state = UTF8PROC_BOUNDCLASS_OTHER ;
324+ if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR )
325+ state_bc = UTF8PROC_BOUNDCLASS_OTHER ;
308326 // Special support for GB11 (emoji extend* zwj / emoji)
309- else if (* state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC ) {
327+ else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC ) {
310328 if (tbc == UTF8PROC_BOUNDCLASS_EXTEND ) // fold EXTEND codepoints into emoji
311- * state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC ;
329+ state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC ;
312330 else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ )
313- * state = UTF8PROC_BOUNDCLASS_E_ZWG ; // state to record emoji+zwg combo
331+ state_bc = UTF8PROC_BOUNDCLASS_E_ZWG ; // state to record emoji+zwg combo
314332 else
315- * state = tbc ;
333+ state_bc = tbc ;
316334 }
317335 else
318- * state = tbc ;
336+ state_bc = tbc ;
319337
338+ * state = state_bc + (state_icb << 8 );
320339 return break_permitted ;
321340 }
322341 else
@@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
326345UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful (
327346 utf8proc_int32_t c1 , utf8proc_int32_t c2 , utf8proc_int32_t * state ) {
328347
329- return grapheme_break_extended (utf8proc_get_property (c1 )-> boundclass ,
330- utf8proc_get_property (c2 )-> boundclass ,
348+ const utf8proc_property_t * p1 = utf8proc_get_property (c1 );
349+ const utf8proc_property_t * p2 = utf8proc_get_property (c2 );
350+ return grapheme_break_extended (p1 -> boundclass ,
351+ p2 -> boundclass ,
352+ p1 -> indic_conjunct_break ,
353+ p2 -> indic_conjunct_break ,
331354 state );
332355}
333356
@@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
498521 }
499522 if (options & UTF8PROC_CHARBOUND ) {
500523 utf8proc_bool boundary ;
501- int tbc = property -> boundclass ;
502- boundary = grapheme_break_extended ( * last_boundclass , tbc , last_boundclass );
524+ boundary = grapheme_break_extended ( 0 , property -> boundclass , 0 , property -> indic_conjunct_break ,
525+ last_boundclass );
503526 if (boundary ) {
504527 if (bufsize >= 1 ) dst [0 ] = -1 ; /* sentinel value for grapheme break */
505528 if (bufsize >= 2 ) dst [1 ] = uc ;
0 commit comments