@@ -394,9 +394,48 @@ bool isGraphemeClusterBoundary(String text, int start, int end, int index) {
394394 // The backwards automaton is built for this use case.
395395 // Most of the apparent complication in this function is merely dealing with
396396 // surrogates.
397- if (start <= index && index < end) {
398- var next = nextBreak (text, start, end, index);
399- return next == index;
397+ if (start < index && index < end) {
398+ int prevCategory, nextCategory;
399+ var cursorBefore = index - 1 ;
400+ var prevChar = text.codeUnitAt (cursorBefore);
401+ var nextChar = text.codeUnitAt (index);
402+ if (prevChar & 0xF800 != 0xD800 ) {
403+ prevCategory = low (prevChar);
404+ } else if (prevChar & 0xFC00 == 0xD800 ) {
405+ // Either not a break because it's in the middle of a surrogate pair,
406+ // or always a break after an unpaired surrogate.
407+ return nextChar & 0xFC00 != 0xDC00 ;
408+ } else if (start < cursorBefore) {
409+ assert (prevChar & 0xFC00 == 0xDC00 );
410+ var headChar = text.codeUnitAt (-- cursorBefore);
411+ if (headChar & 0xFC00 != 0xD800 ) {
412+ // Always break after unpaired tail surrogate.
413+ return true ;
414+ }
415+ prevCategory = high (headChar, prevChar);
416+ } else {
417+ // Break after unpaired tail surrogate.
418+ return true ;
419+ }
420+ if (nextChar & 0xF800 != 0xD800 ) {
421+ nextCategory = low (nextChar);
422+ } else if (nextChar & 0xFC00 == 0xD800 && index + 1 < end) {
423+ var tailChar = text.codeUnitAt (index + 1 );
424+ if (tailChar & 0xFC00 != 0xDC00 ) {
425+ // Always break before unpaired head surrogate.
426+ return true ;
427+ }
428+ nextCategory = high (nextChar, tailChar);
429+ } else {
430+ // Always break before an unpaired tail surrogate, or a head surrogate with no tail before `end`.
431+ return true ;
432+ }
433+ var state = move (move (stateCAny, prevCategory), nextCategory);
434+ if (state & maskBreak != flagNoBreak) {
435+ return true ;
436+ }
437+ if (state & maskLookahead == 0 ) return false ;
438+ return _lookaheadSimple (text, start, cursorBefore, state);
400439 }
401440 return true ;
402441}
@@ -479,207 +518,88 @@ int nextBreak(String text, int start, int end, int index) {
479518 // The `prevCategory` now is the category for the character
480519 // from `indexBefore` to `index`, and `index` is the minimum valid
481520 // return value (earliest next break).
482-
483- var state = stateSoTNoBreak;
484- // In many cases, the state at `index` can be predicted precisely
485- // from just the one prior character. That's the case for every
486- // input category where the output state doesn't depend on the input
487- // state (ignoring whether it breaks before the previous character or not).
488- //
489- // The cases where that is not possible are:
490- // - A regional indicator (need to know if there is an even or odd number
491- // of regional indicators before that).
492- // - An Extend{InCB=None|Extend|Linked} or ZWJ
493- // - For Extends{...} and ZWJ, it behaves differently if in state Pic
494- // (after Pictographic+Extend*).
495- // - For {InCB=Extend|Linked} or ZWJ (which is InCB=Extend),
496- // it behaves differently if in state InC or InCL, so after
497- // Other(InCB=Consonant)+(InCB={Extend|Linked})*
498- // In those cases, check the next character first. It may make the look-behind
499- // unnecessary, if it's a character that guarantees a break.
500- // Otherwise look-behind to see if the prior characters are `Pic+Extend`
501- // or `InCB=Consonant+InCB={Extend+Linked}` (and whether at least one Linked).
502- if (prevCategory == categoryRegionalIndicator ||
503- prevCategory == categoryExtend ||
504- prevCategory >= categoryZWJ) {
505- // >= ZWJ implies ZWJ|Extend(InCB={Extend|Linked}) for an input character.
506- // Only higher categories are synthetic EoT/SoT characters.
507-
508- // TODO: Can this be made into an automaton?
509-
510- var indexAfter = index + 1 ;
521+ var state = move (stateCAny, prevCategory);
522+ while (index < end) {
511523 var nextChar = text.codeUnitAt (index);
512- int nextCategory;
513- if (nextChar & 0xFC00 != 0xD800 ) {
514- nextCategory = low (nextChar);
515- } else if (indexAfter < end) {
516- // Lead surrogate.
517- var tailChar = text.codeUnitAt (indexAfter);
518- if (tailChar & 0xFC00 == 0xDC00 ) {
519- indexAfter += 1 ;
520- nextCategory = high (nextChar, tailChar);
521- } else {
522- return index; // Unpaired surrogate
524+ var nextIndex = index + 1 ;
525+ int category;
526+ if (nextChar & 0xFC00 != 0xD800 || nextIndex == end) {
527+ category = low (nextChar);
528+ state = move (state, category);
529+ if (state & maskFlags == flagNoBreak) {
530+ index++ ;
531+ continue ;
523532 }
524533 } else {
525- return index; // Unpaired surrogate, treat as control.
526- }
527-
528- // The `nextCategory is the category of the character at positions
529- // from `index` to `indexAfter`.
530-
531- if (prevCategory == categoryRegionalIndicator) {
532- if (nextCategory == categoryRegionalIndicator) {
533- // Prev = RI, next = RI.
534- var idStateBefore = lookaheadRegional (text, start, indexBefore);
535- if (idStateBefore & maskBreak != flagNoBreak) {
536- // Break after previous character.
537- return index;
538- }
539- state = stateOther;
540- // Move index to after RI+RI.
541- index = indexAfter;
542- } else {
543- state = move (stateOther, nextCategory);
544- if (state & maskBreak != flagNoBreak) return index;
545- index = indexAfter;
546- }
547- } else if (prevCategory == categoryZWJ) {
548- if (nextCategory == categoryPictographic) {
549- var prevPic = lookaheadPictographicExtend (text, start, indexBefore);
550- if (prevPic < 0 ) {
551- return index;
552- }
553- state = statePictographic;
554- } else {
555- // Default for ZWJ if not after Pic+Ext* or
556- // InCB=Consonant+InCB={Extend|Linked}*
557- state = stateOther;
558- if (nextCategory >= categoryOtherIndicConsonant) {
559- var prevConsonant = lookaheadInCBLinkedConsonant (
560- text, start, indexBefore, prevCategory);
561- if (prevConsonant >= 0 ) {
562- state = prevConsonant.isOdd ? stateInCL : stateInC;
563- }
564- }
565- state = move (state, nextCategory);
566- if (state & maskBreak != flagNoBreak) return index;
567- }
568- } else if (prevCategory == categoryExtend ||
569- nextCategory == categoryExtend) {
570- // The `categoryExtend` has InCB=None, so not affected by GB9c, only GB9b.
571-
572- // At this point `prevCategory` is definitely an Extend.
573- // Do Pictographic lookbehind if `nextCategory` is any Extend or ZWJ.
574- assert (prevCategory == categoryExtend ||
575- prevCategory == categoryExtendIndicExtend ||
576- prevCategory == categoryExtendIndicLinked);
577- state = stateOther;
578- if (nextCategory == categoryExtend || nextCategory >= categoryZWJ) {
579- // Look behind for Pic+Ext*.
580- var prevPic = lookaheadPictographicExtend (text, start, indexBefore);
581- if (prevPic >= 0 ) {
582- state = statePictographic;
583- }
584- }
585- state = move (state, nextCategory);
586- if (state & maskBreak != flagNoBreak) return index;
587- } else if (nextCategory == categoryOtherIndicConsonant) {
588- assert (prevCategory >= categoryExtendIndicExtend);
589- var prevConsonant =
590- lookaheadInCBLinkedConsonant (text, start, indexBefore, prevCategory);
591- if (prevConsonant.isEven) {
592- return index;
534+ var tail = text.codeUnitAt (nextIndex);
535+ category = categoryControl;
536+ if (tail & 0xFC00 == 0xDC00 ) {
537+ nextIndex += 1 ;
538+ category = high (nextChar, tail);
593539 }
594- state = stateInC | flagNoBreak;
595- } else if (nextCategory >= categoryZWJ) {
596- assert (prevCategory >= categoryExtendIndicExtend);
597- // It's all Extend{InCB!=None}, can't say whether to look for
598- // Pic or InCB=Consonant.
599- state = lookaheadPictographicExtendOrIndic (
600- text, start, indexBefore, prevCategory, nextCategory);
601- if (state & maskBreak != flagNoBreak) {
602- return index;
540+ state = move (state, category);
541+ if (state & maskFlags == flagNoBreak) {
542+ index = nextIndex;
543+ continue ;
603544 }
545+ }
546+ if (state & maskFlags == flagBreak) return index;
547+ assert (state & maskFlags == flagLookahead);
548+
549+ if (_lookaheadSimple (text, start, indexBefore, state)) return index;
550+
551+ // Find the correct forward category.
552+ // There are only three possible character categories that can trigger
553+ // a look-behind.
554+ if (category == categoryRegionalIndicator) {
555+ assert (state == stateLookaheadRegionalEven | flagLookahead);
556+ // Started by RI+RI.
557+ state = stateRegionalEven;
558+ } else if (category == categoryOtherIndicConsonant) {
559+ assert (
560+ state == (stateLookaheadInC | flagLookahead) ||
561+ state == (stateLookaheadInCL | flagLookahead),
562+ state);
563+ state = stateInC;
604564 } else {
605- // Doesn't need further lookahead, one character is enough.
606- state = move (stateSoTNoBreak, prevCategory);
607- state = move (state, nextCategory);
608- if (state & maskBreak != flagNoBreak) return index;
565+ assert (category == categoryPictographic);
566+ assert (state == (stateLookaheadZWJPictographic | flagLookahead));
567+ state = statePictographic;
609568 }
610- index = indexAfter;
611- } else {
612- // Just look at one prior character.
613- state = move (stateSoTNoBreak, prevCategory);
569+ index = nextIndex;
614570 }
615- // Break wasn't at index, so move forward until finding the break.
616- return Breaks (text, index, text.length, state). nextBreak () ;
571+ assert ( index == end);
572+ return index;
617573}
618574
619- // Look behind for a Pic+Ext+ZWJ? or Consonant+(Extend|Linked)* sequence.
620- // Given the last two categories.
621- //
622- // The categories are ones that allow both prefixes, so
623- // [category1] is either [categoryExtendInCBExtend] or
624- // [categoryExtendInCBLinked], and [category2] is one of those or [categoryZWJ].
625- //
626- // Returns the state after `category2`, with the break flag reporting
627- // whether to break before `category2` or not.
628- int lookaheadPictographicExtendOrIndic (
629- String text, int start, int cursor, int category1, int category2) {
630- assert (
631- category1 == categoryExtendIndicExtend ||
632- category1 == categoryExtendIndicLinked,
633- category1);
634- assert (
635- category2 == categoryZWJ ||
636- category2 == categoryExtendIndicExtend ||
637- category2 == categoryExtendIndicLinked,
638- category2);
639- var linked = (category1 == categoryExtendIndicLinked ||
640- category2 == categoryExtendIndicLinked)
641- ? (stateInCL | flagNoBreak)
642- : (stateInC | flagNoBreak);
643- loop:
575+ /// Whether to break before a later character.
576+ ///
577+ /// Used only to find grapheme category breaks, not part of moving forwards
578+ /// or backwards from known breaks.
579+ ///
580+ /// That character is always one of [categoryOtherIndicConsonant] ,
581+ /// [categoryPictographic] or [categoryRegionalIndicator] , the only
582+ /// characters where knowing whether to break before them depends on
583+ /// more than the single prior character.
584+ bool _lookaheadSimple (String text, int start, int cursor, int backState) {
644585 while (cursor > start) {
645- int category ;
646- var char = text. codeUnitAt ( -- cursor);
647- if (char & 0xFC00 != 0xDC00 ) {
648- category = low (char) ;
586+ var prevChar = text. codeUnitAt ( -- cursor) ;
587+ if (prevChar & 0xFC00 != 0xDC00 || cursor == start) {
588+ backState = moveBack (backState, low (prevChar));
589+ if (backState >= stateLookaheadMin) continue ;
649590 } else {
650- if (cursor <= start) break ;
651- var head = text.codeUnitAt (-- cursor);
652- if (head & 0xFC00 != 0xD800 ) break ;
653- category = high (head, char);
654- }
655- switch (category) {
656- case categoryExtend:
657- var prevPic = lookaheadPictographicExtend (text, start, cursor);
658- if (prevPic < 0 ) break loop;
659- continue pictographic;
660- pictographic:
661- case categoryPictographic:
662- return category2 == categoryZWJ
663- ? (statePictographicZWJ | flagNoBreak)
664- : (statePictographic | flagNoBreak);
665- case categoryZWJ:
666- var prevConsonant =
667- lookaheadInCBLinkedConsonant (text, start, cursor, category);
668- if (prevConsonant < 0 ) break loop;
669- if (prevConsonant.isOdd) {
670- return stateInCL | flagNoBreak;
671- }
672- return linked;
673- case categoryOtherIndicConsonant:
674- return linked;
675- case categoryExtendIndicLinked:
676- linked = stateInCL | flagNoBreak;
677- case categoryExtendIndicExtend:
678- break ; // No change.
679- default :
680- break loop;
591+ var headChar = text.codeUnitAt (-- cursor);
592+ int category;
593+ if (headChar & 0xFC00 == 0xD800 ) {
594+ category = high (headChar, prevChar);
595+ } else {
596+ category = categoryControl;
597+ cursor++ ;
598+ }
599+ backState = moveBack (backState, category);
600+ if (backState >= stateLookaheadMin) continue ;
681601 }
602+ return (backState & maskBreak != flagNoBreak);
682603 }
683- // Default behavior if no Pic or InCB=Consonant found.
684- return move (move (stateSoTNoBreak, category1), category2);
604+ return moveBack (backState, categorySoT) & maskBreak != flagNoBreak;
685605}
0 commit comments