Skip to content

Commit 6158e75

Browse files
committed
Turned all look-ahead into tables.
1 parent 32468bf commit 6158e75

File tree

6 files changed

+495
-297
lines changed

6 files changed

+495
-297
lines changed

pkgs/characters/lib/src/grapheme_clusters/breaks.dart

Lines changed: 112 additions & 192 deletions
Original file line numberDiff line numberDiff line change
@@ -394,9 +394,48 @@ bool isGraphemeClusterBoundary(String text, int start, int end, int index) {
394394
// The backwards automaton is built for this use case.
395395
// Most of the apparent complication in this function is merely dealing with
396396
// surrogates.
397-
if (start <= index && index < end) {
398-
var next = nextBreak(text, start, end, index);
399-
return next == index;
397+
if (start < index && index < end) {
398+
int prevCategory, nextCategory;
399+
var cursorBefore = index - 1;
400+
var prevChar = text.codeUnitAt(cursorBefore);
401+
var nextChar = text.codeUnitAt(index);
402+
if (prevChar & 0xF800 != 0xD800) {
403+
prevCategory = low(prevChar);
404+
} else if (prevChar & 0xFC00 == 0xD800) {
405+
// Either not a break because it's in the middle of a surrogate pair,
406+
// or always a break after an unpaired surrogate.
407+
return nextChar & 0xFC00 != 0xDC00;
408+
} else if (start < cursorBefore) {
409+
assert(prevChar & 0xFC00 == 0xDC00);
410+
var headChar = text.codeUnitAt(--cursorBefore);
411+
if (headChar & 0xFC00 != 0xD800) {
412+
// Always break after unpaired tail surrogate.
413+
return true;
414+
}
415+
prevCategory = high(headChar, prevChar);
416+
} else {
417+
// Break after unpaired tail surrogate.
418+
return true;
419+
}
420+
if (nextChar & 0xF800 != 0xD800) {
421+
nextCategory = low(nextChar);
422+
} else if (nextChar & 0xFC00 == 0xD800 && index + 1 < end) {
423+
var tailChar = text.codeUnitAt(index + 1);
424+
if (tailChar & 0xFC00 != 0xDC00) {
425+
// Always break before unpaired head surrogate.
426+
return true;
427+
}
428+
nextCategory = high(nextChar, tailChar);
429+
} else {
430+
// Always break before an unpaired tail surrogate, or before a head
// surrogate that has no code unit after it before `end`.
431+
return true;
432+
}
433+
var state = move(move(stateCAny, prevCategory), nextCategory);
434+
if (state & maskBreak != flagNoBreak) {
435+
return true;
436+
}
437+
if (state & maskLookahead == 0) return false;
438+
return _lookaheadSimple(text, start, cursorBefore, state);
400439
}
401440
return true;
402441
}
@@ -479,207 +518,88 @@ int nextBreak(String text, int start, int end, int index) {
479518
// The `prevCategory` now is the category for the character
480519
// from `indexBefore` to `index`, and `index` is the minimum valid
481520
// return value (earliest next break).
482-
483-
var state = stateSoTNoBreak;
484-
// In many cases, the state at `index` can be predicted precisely
485-
// from just the one prior character. That's the case for every
486-
// input category where the output state doesn't depend on the input
487-
// state (ignoring whether it breaks before the previous character or not).
488-
//
489-
// The cases where that is not possible are:
490-
// - A regional indicator (need to know if there is an even or odd number
491-
// of regional indicators before that).
492-
// - An Extend{InCB=None|Extend|Linked} or ZWJ
493-
// - For Extends{...} and ZWJ, it behaves differently if in state Pic
494-
// (after Pictographic+Extend*).
495-
// - For {InCB=Extend|Linked} or ZWJ (which is InCB=Extend),
496-
// it behaves differently if in state InC or InCL, so after
497-
// Other(InCB=Consonant)+(InCB={Extend|Linked})*
498-
// In those cases, check the next character first. It may make the look-behind
499-
// unnecessary, if it's a character that guarantees a break.
500-
// Otherwise look-behind to see if the prior characters are `Pic+Extend`
501-
// or `InCB=Consonant+InCB={Extend+Linked}` (and whether at least one Linked).
502-
if (prevCategory == categoryRegionalIndicator ||
503-
prevCategory == categoryExtend ||
504-
prevCategory >= categoryZWJ) {
505-
// >= ZWJ implies ZWJ|Extend(InCB={Extend|Linked}) for an input character.
506-
// Only higher categories are synthetic EoT/SoT characters.
507-
508-
// TODO: Can this be made into an automaton?
509-
510-
var indexAfter = index + 1;
521+
var state = move(stateCAny, prevCategory);
522+
while (index < end) {
511523
var nextChar = text.codeUnitAt(index);
512-
int nextCategory;
513-
if (nextChar & 0xFC00 != 0xD800) {
514-
nextCategory = low(nextChar);
515-
} else if (indexAfter < end) {
516-
// Lead surrogate.
517-
var tailChar = text.codeUnitAt(indexAfter);
518-
if (tailChar & 0xFC00 == 0xDC00) {
519-
indexAfter += 1;
520-
nextCategory = high(nextChar, tailChar);
521-
} else {
522-
return index; // Unpaired surrogate
524+
var nextIndex = index + 1;
525+
int category;
526+
if (nextChar & 0xFC00 != 0xD800 || nextIndex == end) {
527+
category = low(nextChar);
528+
state = move(state, category);
529+
if (state & maskFlags == flagNoBreak) {
530+
index++;
531+
continue;
523532
}
524533
} else {
525-
return index; // Unpaired surrogate, treat as control.
526-
}
527-
528-
// The `nextCategory is the category of the character at positions
529-
// from `index` to `indexAfter`.
530-
531-
if (prevCategory == categoryRegionalIndicator) {
532-
if (nextCategory == categoryRegionalIndicator) {
533-
// Prev = RI, next = RI.
534-
var idStateBefore = lookaheadRegional(text, start, indexBefore);
535-
if (idStateBefore & maskBreak != flagNoBreak) {
536-
// Break after previous character.
537-
return index;
538-
}
539-
state = stateOther;
540-
// Move index to after RI+RI.
541-
index = indexAfter;
542-
} else {
543-
state = move(stateOther, nextCategory);
544-
if (state & maskBreak != flagNoBreak) return index;
545-
index = indexAfter;
546-
}
547-
} else if (prevCategory == categoryZWJ) {
548-
if (nextCategory == categoryPictographic) {
549-
var prevPic = lookaheadPictographicExtend(text, start, indexBefore);
550-
if (prevPic < 0) {
551-
return index;
552-
}
553-
state = statePictographic;
554-
} else {
555-
// Default for ZWJ if not after Pic+Ext* or
556-
// InCB=Consonant+InCB={Extend|Linked}*
557-
state = stateOther;
558-
if (nextCategory >= categoryOtherIndicConsonant) {
559-
var prevConsonant = lookaheadInCBLinkedConsonant(
560-
text, start, indexBefore, prevCategory);
561-
if (prevConsonant >= 0) {
562-
state = prevConsonant.isOdd ? stateInCL : stateInC;
563-
}
564-
}
565-
state = move(state, nextCategory);
566-
if (state & maskBreak != flagNoBreak) return index;
567-
}
568-
} else if (prevCategory == categoryExtend ||
569-
nextCategory == categoryExtend) {
570-
// The `categoryExtend` has InCB=None, so not affected by GB9c, only GB9b.
571-
572-
// At this point `prevCategory` is definitely an Extend.
573-
// Do Pictographic lookbehind if `nextCategory` is any Extend or ZWJ.
574-
assert(prevCategory == categoryExtend ||
575-
prevCategory == categoryExtendIndicExtend ||
576-
prevCategory == categoryExtendIndicLinked);
577-
state = stateOther;
578-
if (nextCategory == categoryExtend || nextCategory >= categoryZWJ) {
579-
// Look behind for Pic+Ext*.
580-
var prevPic = lookaheadPictographicExtend(text, start, indexBefore);
581-
if (prevPic >= 0) {
582-
state = statePictographic;
583-
}
584-
}
585-
state = move(state, nextCategory);
586-
if (state & maskBreak != flagNoBreak) return index;
587-
} else if (nextCategory == categoryOtherIndicConsonant) {
588-
assert(prevCategory >= categoryExtendIndicExtend);
589-
var prevConsonant =
590-
lookaheadInCBLinkedConsonant(text, start, indexBefore, prevCategory);
591-
if (prevConsonant.isEven) {
592-
return index;
534+
var tail = text.codeUnitAt(nextIndex);
535+
category = categoryControl;
536+
if (tail & 0xFC00 == 0xDC00) {
537+
nextIndex += 1;
538+
category = high(nextChar, tail);
593539
}
594-
state = stateInC | flagNoBreak;
595-
} else if (nextCategory >= categoryZWJ) {
596-
assert(prevCategory >= categoryExtendIndicExtend);
597-
// It's all Extend{InCB!=None}, can't say whether to look for
598-
// Pic or InCB=Consonant.
599-
state = lookaheadPictographicExtendOrIndic(
600-
text, start, indexBefore, prevCategory, nextCategory);
601-
if (state & maskBreak != flagNoBreak) {
602-
return index;
540+
state = move(state, category);
541+
if (state & maskFlags == flagNoBreak) {
542+
index = nextIndex;
543+
continue;
603544
}
545+
}
546+
if (state & maskFlags == flagBreak) return index;
547+
assert(state & maskFlags == flagLookahead);
548+
549+
if (_lookaheadSimple(text, start, indexBefore, state)) return index;
550+
551+
// Find the correct forward category.
552+
// There are only three possible character categories that can trigger
553+
// a look-behind.
554+
if (category == categoryRegionalIndicator) {
555+
assert(state == stateLookaheadRegionalEven | flagLookahead);
556+
// Started by RI+RI.
557+
state = stateRegionalEven;
558+
} else if (category == categoryOtherIndicConsonant) {
559+
assert(
560+
state == (stateLookaheadInC | flagLookahead) ||
561+
state == (stateLookaheadInCL | flagLookahead),
562+
state);
563+
state = stateInC;
604564
} else {
605-
// Doesn't need further lookahead, one character is enough.
606-
state = move(stateSoTNoBreak, prevCategory);
607-
state = move(state, nextCategory);
608-
if (state & maskBreak != flagNoBreak) return index;
565+
assert(category == categoryPictographic);
566+
assert(state == (stateLookaheadZWJPictographic | flagLookahead));
567+
state = statePictographic;
609568
}
610-
index = indexAfter;
611-
} else {
612-
// Just look at one prior character.
613-
state = move(stateSoTNoBreak, prevCategory);
569+
index = nextIndex;
614570
}
615-
// Break wasn't at index, so move forward until finding the break.
616-
return Breaks(text, index, text.length, state).nextBreak();
571+
assert(index == end);
572+
return index;
617573
}
618574

619-
// Look behind for a Pic+Ext+ZWJ? or Consonant+(Extend|Linked)* sequence.
620-
// Given the last two categories.
621-
//
622-
// The categories are ones that allow both prefixes, so
623-
// [category1] is either [categoryExtendInCBExtend] or
624-
// [categoryExtendInCBLinked], and [category2] is one of those or [categoryZWJ].
625-
//
626-
// Returns the state after `category2`, with the break flag reporting
627-
// whether to break before `category2` or not.
628-
int lookaheadPictographicExtendOrIndic(
629-
String text, int start, int cursor, int category1, int category2) {
630-
assert(
631-
category1 == categoryExtendIndicExtend ||
632-
category1 == categoryExtendIndicLinked,
633-
category1);
634-
assert(
635-
category2 == categoryZWJ ||
636-
category2 == categoryExtendIndicExtend ||
637-
category2 == categoryExtendIndicLinked,
638-
category2);
639-
var linked = (category1 == categoryExtendIndicLinked ||
640-
category2 == categoryExtendIndicLinked)
641-
? (stateInCL | flagNoBreak)
642-
: (stateInC | flagNoBreak);
643-
loop:
575+
/// Whether to break before a later character.
576+
///
577+
/// Used only to find grapheme cluster breaks, not part of moving forwards
578+
/// or backwards from known breaks.
579+
///
580+
/// That character is always one of [categoryOtherIndicConsonant],
581+
/// [categoryPictographic] or [categoryRegionalIndicator], the only
582+
/// characters where knowing whether to break before them depends on
583+
/// more than the single prior character.
584+
bool _lookaheadSimple(String text, int start, int cursor, int backState) {
644585
while (cursor > start) {
645-
int category;
646-
var char = text.codeUnitAt(--cursor);
647-
if (char & 0xFC00 != 0xDC00) {
648-
category = low(char);
586+
var prevChar = text.codeUnitAt(--cursor);
587+
if (prevChar & 0xFC00 != 0xDC00 || cursor == start) {
588+
backState = moveBack(backState, low(prevChar));
589+
if (backState >= stateLookaheadMin) continue;
649590
} else {
650-
if (cursor <= start) break;
651-
var head = text.codeUnitAt(--cursor);
652-
if (head & 0xFC00 != 0xD800) break;
653-
category = high(head, char);
654-
}
655-
switch (category) {
656-
case categoryExtend:
657-
var prevPic = lookaheadPictographicExtend(text, start, cursor);
658-
if (prevPic < 0) break loop;
659-
continue pictographic;
660-
pictographic:
661-
case categoryPictographic:
662-
return category2 == categoryZWJ
663-
? (statePictographicZWJ | flagNoBreak)
664-
: (statePictographic | flagNoBreak);
665-
case categoryZWJ:
666-
var prevConsonant =
667-
lookaheadInCBLinkedConsonant(text, start, cursor, category);
668-
if (prevConsonant < 0) break loop;
669-
if (prevConsonant.isOdd) {
670-
return stateInCL | flagNoBreak;
671-
}
672-
return linked;
673-
case categoryOtherIndicConsonant:
674-
return linked;
675-
case categoryExtendIndicLinked:
676-
linked = stateInCL | flagNoBreak;
677-
case categoryExtendIndicExtend:
678-
break; // No change.
679-
default:
680-
break loop;
591+
var headChar = text.codeUnitAt(--cursor);
592+
int category;
593+
if (headChar & 0xFC00 == 0xD800) {
594+
category = high(headChar, prevChar);
595+
} else {
596+
category = categoryControl;
597+
cursor++;
598+
}
599+
backState = moveBack(backState, category);
600+
if (backState >= stateLookaheadMin) continue;
681601
}
602+
return (backState & maskBreak != flagNoBreak);
682603
}
683-
// Default behavior if no Pic or InCB=Consonant found.
684-
return move(move(stateSoTNoBreak, category1), category2);
604+
return moveBack(backState, categorySoT) & maskBreak != flagNoBreak;
685605
}

0 commit comments

Comments
 (0)