@@ -437,13 +437,12 @@ internal struct _GraphemeBreakingState {
437
437
}
438
438
439
439
extension Unicode {
440
- /// A state machine for recognizing `Character` (i.e., extended grapheme
440
+ /// A state machine for recognizing character (i.e., extended grapheme
441
441
/// cluster) boundaries in an arbitrary series of Unicode scalars.
442
442
///
443
- /// The recognizer needs to be initialized with the first scalar in the
444
- /// series. Subsequent scalars must then be fed one by one to the
445
- /// `hasCharacterBoundary(before:)` method, which returns a Boolean value
446
- /// indicating whether the given scalar starts a new `Character`.
443
+ /// To detect grapheme breaks in a sequence of Unicode scalars, feed each of
444
+ /// them to the `hasBreak(before:)` method. The method returns true if the
445
+ /// sequence has a grapheme break preceding the given value.
447
446
///
448
447
/// The results produced by this state machine are guaranteed to match the way
449
448
/// `String` splits its contents into `Character` values.
@@ -454,9 +453,9 @@ extension Unicode {
454
453
internal var _state : _GraphemeBreakingState
455
454
456
455
/// Returns a non-nil value if it can be determined whether there is a
457
- /// `Character` break between `scalar1` and `scalar2` without knowing
458
- /// anything about the scalars that precede `scalar1`. This can be used as a
459
- /// fast (but incomplete) test before spinning up a full state machine
456
+ /// grapheme break between `scalar1` and `scalar2` without knowing anything
457
+ /// about the scalars that precede `scalar1`. This can optionally be used as
458
+ /// a fast (but incomplete) test before spinning up a full state machine
460
459
/// session.
461
460
@_effects ( releasenone)
462
461
public static func quickBreak(
@@ -472,22 +471,32 @@ extension Unicode {
472
471
return nil
473
472
}
474
473
475
- /// Initialize a new `Character` recognizer, feeding it the given value as
476
- /// the first Unicode scalar in the series. The state machine assumes that
477
- /// `first` is supposed to start a new extended grapheme cluster.
478
- public init ( first: Unicode . Scalar ) {
474
+ /// Initialize a new character recognizer at the _start of text_ (sot)
475
+ /// position.
476
+ ///
477
+ /// The resulting state machine will report a grapheme break on the
478
+ /// first scalar that is fed to it.
479
+ public init ( ) {
479
480
_state = _GraphemeBreakingState ( )
480
- _previous = first
481
+ // To avoid having to handle the empty case specially, we use NUL as the
482
+ // placeholder before the first scalar. NUL is a control character, so per
483
+ // rule GB4, it will induce an unconditional grapheme break before the
484
+ // first actual scalar, emulating GB1.
485
+ _previous = Unicode . Scalar ( 0 as UInt8 )
481
486
}
482
487
483
488
/// Feeds the next scalar to the state machine, returning a Boolean value
484
- /// indicating whether it starts a new `Character`.
489
+ /// indicating whether it starts a new extended grapheme cluster.
490
+ ///
491
+ /// This method will always report a break the first time it is called
492
+ /// on a newly initialized recognizer.
485
493
///
486
- /// The state machine does not carry information across `Character`
494
+ /// The state machine does not carry information across character
487
495
/// boundaries. I.e., if this method returns true, then `self` after the
488
- /// call is equivalent to `_CharacterRecognizer(first: next)`.
496
+ /// call is equivalent to a newly initialized recognizer instance that has
497
+ /// been fed the same scalar.
489
498
@_effects ( releasenone)
490
- public mutating func hasCharacterBoundary (
499
+ public mutating func hasBreak (
491
500
before next: Unicode . Scalar
492
501
) -> Bool {
493
502
let r = _state. shouldBreak ( between: _previous, and: next)
0 commit comments