Skip to content

Commit 9922b00

Browse files
committed
[stdlib] _CharacterRecognizer: Remove initializer argument
1 parent 699a3f0 commit 9922b00

File tree

2 files changed

+39
-34
lines changed

2 files changed

+39
-34
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -437,13 +437,12 @@ internal struct _GraphemeBreakingState {
437437
}
438438

439439
extension Unicode {
440-
/// A state machine for recognizing `Character` (i.e., extended grapheme
440+
/// A state machine for recognizing character (i.e., extended grapheme
441441
/// cluster) boundaries in an arbitrary series of Unicode scalars.
442442
///
443-
/// The recognizer needs to be initialized with the first scalar in the
444-
/// series. Subsequent scalars must then be fed one by one to the
445-
/// `hasCharacterBoundary(before:)` method, which returns a Boolean value
446-
/// indicating whether the given scalar starts a new `Character`.
443+
/// To detect grapheme breaks in a sequence of Unicode scalars, feed each of
444+
/// them to the `hasBreak(before:)` method. The method returns true if the
445+
/// sequence has a grapheme break preceding the given value.
447446
///
448447
/// The results produced by this state machine are guaranteed to match the way
449448
/// `String` splits its contents into `Character` values.
@@ -454,9 +453,9 @@ extension Unicode {
454453
internal var _state: _GraphemeBreakingState
455454

456455
/// Returns a non-nil value if it can be determined whether there is a
457-
/// `Character` break between `scalar1` and `scalar2` without knowing
458-
/// anything about the scalars that precede `scalar1`. This can be used as a
459-
/// fast (but incomplete) test before spinning up a full state machine
456+
/// grapheme break between `scalar1` and `scalar2` without knowing anything
457+
/// about the scalars that precede `scalar1`. This can optionally be used as
458+
/// a fast (but incomplete) test before spinning up a full state machine
460459
/// session.
461460
@_effects(releasenone)
462461
public static func quickBreak(
@@ -472,22 +471,32 @@ extension Unicode {
472471
return nil
473472
}
474473

475-
/// Initialize a new `Character` recognizer, feeding it the given value as
476-
/// the first Unicode scalar in the series. The state machine assumes that
477-
/// `first` is supposed to start a new extended grapheme cluster.
478-
public init(first: Unicode.Scalar) {
474+
/// Initialize a new character recognizer at the _start of text_ (sot)
475+
/// position.
476+
///
477+
/// The resulting state machine will report a grapheme break on the
478+
/// first scalar that is fed to it.
479+
public init() {
479480
_state = _GraphemeBreakingState()
480-
_previous = first
481+
// To avoid having to handle the empty case specially, we use NUL as the
482+
// placeholder before the first scalar. NUL is a control character, so per
483+
// rule GB4, it will induce an unconditional grapheme break before the
484+
// first actual scalar, emulating GB1.
485+
_previous = Unicode.Scalar(0 as UInt8)
481486
}
482487

483488
/// Feeds the next scalar to the state machine, returning a Boolean value
484-
/// indicating whether it starts a new `Character`.
489+
/// indicating whether it starts a new extended grapheme cluster.
490+
///
491+
/// This method will always report a break the first time it is called
492+
/// on a newly initialized recognizer.
485493
///
486-
/// The state machine does not carry information across `Character`
494+
/// The state machine does not carry information across character
487495
/// boundaries. I.e., if this method returns true, then `self` after the
488-
/// call is equivalent to `_CharacterRecognizer(first: next)`.
496+
/// call is equivalent to feeding the same scalar to a newly initialized
497+
/// recognizer instance.
489498
@_effects(releasenone)
490-
public mutating func hasCharacterBoundary(
499+
public mutating func hasBreak(
491500
before next: Unicode.Scalar
492501
) -> Bool {
493502
let r = _state.shouldBreak(between: _previous, and: next)

test/stdlib/CharacterRecognizer.swift

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,17 @@ var suite = TestSuite("CharacterRecognizer")
1313
defer { runAllTests() }
1414

1515
if #available(SwiftStdlib 5.8, *) {
16-
suite.test("Unicode test data") {
16+
suite.test("Unicode test data/hasBreak") {
1717
for test in graphemeBreakTests {
18-
var it = test.string.unicodeScalars.makeIterator()
19-
guard let first = it.next() else { continue }
20-
var recognizer = Unicode._CharacterRecognizer(first: first)
18+
var recognizer = Unicode._CharacterRecognizer()
2119
var pieces: [[Unicode.Scalar]] = []
22-
var piece: [Unicode.Scalar] = [first]
23-
while let next = it.next() {
24-
if recognizer.hasCharacterBoundary(before: next) {
25-
pieces.append(piece)
26-
piece = [next]
20+
var piece: [Unicode.Scalar] = []
21+
for scalar in test.string.unicodeScalars {
22+
if recognizer.hasBreak(before: scalar) {
23+
if !piece.isEmpty { pieces.append(piece) }
24+
piece = [scalar]
2725
} else {
28-
piece.append(next)
26+
piece.append(scalar)
2927
}
3028
}
3129
if !piece.isEmpty { pieces.append(piece) }
@@ -57,15 +55,13 @@ if #available(SwiftStdlib 5.8, *) {
5755
let expectedBreaks = Array(sampleString.indices)
5856

5957
let u = sampleString.unicodeScalars
60-
var i = u.startIndex
61-
var actualBreaks = [i]
62-
var recognizer = Unicode._CharacterRecognizer(first: u[i])
63-
u.formIndex(after: &i)
64-
while i < u.endIndex {
65-
if recognizer.hasCharacterBoundary(before: u[i]) {
58+
59+
var recognizer = Unicode._CharacterRecognizer()
60+
var actualBreaks: [String.Index] = []
61+
for i in u.indices {
62+
if recognizer.hasBreak(before: u[i]) {
6663
actualBreaks.append(i)
6764
}
68-
u.formIndex(after: &i)
6965
}
7066
expectEqual(actualBreaks, expectedBreaks,
7167
"""

0 commit comments

Comments
 (0)