Skip to content

Commit 699a3f0

Browse files
committed
[stdlib] Add new SPI for grapheme breaking (outside String)
`Unicode._CharacterRecognizer` is a newly exported opaque type that exposes the stdlib’s extended grapheme cluster breaking facility, independent of `String`. This essentially makes the underlying simple state machine public, without exposing any of the (unstable) Unicode details. The ability to perform grapheme breaking over, say, the scalars stored in multiple `String` values can be extremely useful while building custom text processing algorithms and data structures. Ideally this would eventually become API, but before proposing this to Swift Evolution, I’d like to prove the shape of the type in actual use (and we’ll also need to find better names for its operations).
1 parent 6e5097a commit 699a3f0

File tree

2 files changed

+140
-0
lines changed

2 files changed

+140
-0
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,70 @@ internal struct _GraphemeBreakingState {
436436
var shouldBreakRI = false
437437
}
438438

439+
extension Unicode {
  /// An incremental recognizer of `Character` (that is, extended grapheme
  /// cluster) boundaries over an arbitrary sequence of Unicode scalars.
  ///
  /// Create a recognizer by handing it the first scalar of the sequence, then
  /// pass every following scalar, in order, to
  /// `hasCharacterBoundary(before:)`. Each call reports whether the scalar it
  /// was given begins a new `Character`.
  ///
  /// The boundaries reported by this state machine are guaranteed to agree
  /// with the way `String` divides its contents into `Character` values.
  @available(SwiftStdlib 5.8, *)
  public // SPI(Foundation) FIXME: We need API for this
  struct _CharacterRecognizer {
    /// The scalar most recently fed to the recognizer.
    internal var _previous: Unicode.Scalar

    /// Breaking state accumulated since the last reported boundary.
    internal var _state: _GraphemeBreakingState

    /// A cheap, context-free boundary test for a pair of adjacent scalars.
    ///
    /// The answer is non-nil only when it can be decided without knowing
    /// anything about the scalars preceding `scalar1`. Use it as a fast (but
    /// incomplete) check before starting a full state machine session.
    ///
    /// - Returns: `false` for U+000D followed by U+000A, `true` if
    ///   `_hasGraphemeBreakBetween` reports a break for the pair, and `nil`
    ///   when the pair alone is not enough to decide.
    @_effects(releasenone)
    public static func quickBreak(
      between scalar1: Unicode.Scalar,
      and scalar2: Unicode.Scalar
    ) -> Bool? {
      let isCRLF = scalar1.value == 0xD && scalar2.value == 0xA
      guard !isCRLF else { return false }
      guard _hasGraphemeBreakBetween(scalar1, scalar2) else { return nil }
      return true
    }

    /// Creates a `Character` recognizer whose series begins with `first`.
    ///
    /// The state machine assumes that `first` is supposed to start a new
    /// extended grapheme cluster.
    public init(first: Unicode.Scalar) {
      _previous = first
      _state = _GraphemeBreakingState()
    }

    /// Advances the state machine by one scalar and reports whether that
    /// scalar starts a new `Character`.
    ///
    /// No information is carried across `Character` boundaries: whenever this
    /// method returns `true`, `self` is afterwards equivalent to
    /// `_CharacterRecognizer(first: next)`.
    @_effects(releasenone)
    public mutating func hasCharacterBoundary(
      before next: Unicode.Scalar
    ) -> Bool {
      // Whatever the outcome, `next` becomes the scalar that the following
      // call is compared against.
      defer { _previous = next }
      guard _state.shouldBreak(between: _previous, and: next) else {
        return false
      }
      // A boundary was found; start over with pristine state.
      _state = _GraphemeBreakingState()
      return true
    }
  }
}
502+
439503
extension _StringGuts {
440504
// Returns the stride of the grapheme cluster starting at offset `index`,
441505
// assuming it is on a grapheme cluster boundary.

test/stdlib/CharacterRecognizer.swift

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// RUN: %empty-directory(%t)
2+
// RUN: %target-run-stdlib-swift %S/Inputs/
3+
4+
// REQUIRES: executable_test
5+
// REQUIRES: objc_interop
6+
// REQUIRES: optimized_stdlib
7+
8+
import Swift
9+
import StdlibUnittest
10+
import StdlibUnicodeUnittest
11+
12+
// Suite that collects the `Unicode._CharacterRecognizer` tests registered
// below. `runAllTests()` is deferred so that it is invoked only once the
// enclosing scope exits, i.e. after the `suite.test` registrations that follow.
var suite = TestSuite("CharacterRecognizer")
13+
defer { runAllTests() }
14+
15+
if #available(SwiftStdlib 5.8, *) {
  // Feeds each entry of `graphemeBreakTests` through the recognizer one scalar
  // at a time and checks that the resulting grouping of scalars matches the
  // entry's expected `pieces`.
  suite.test("Unicode test data") {
    for test in graphemeBreakTests {
      var scalars = test.string.unicodeScalars.makeIterator()
      // An entry without any scalars has nothing to feed the recognizer.
      guard let first = scalars.next() else { continue }

      var recognizer = Unicode._CharacterRecognizer(first: first)
      var pieces: [[Unicode.Scalar]] = []
      var current: [Unicode.Scalar] = [first]
      while let scalar = scalars.next() {
        guard recognizer.hasCharacterBoundary(before: scalar) else {
          // Still inside the same piece.
          current.append(scalar)
          continue
        }
        // `scalar` opens a new piece; close the one collected so far.
        pieces.append(current)
        current = [scalar]
      }
      if !current.isEmpty { pieces.append(current) }

      expectEqual(pieces, test.pieces,
        "string: \(String(reflecting: test.string))")
    }
  }
}
37+
38+
if #available(SwiftStdlib 5.8, *) {
39+
suite.test("Consistency with Swift String's behavior") {
40+
let sampleString = #"""
41+
The powerful programming language that is also easy to learn.
42+
손쉽게 학습할 수 있는 강력한 프로그래밍 언어.
43+
🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈
44+
some🔩times 🛺 placed 🎣 in 🥌 the 🆘 mid🔀dle 🇦🇶or🏁 around 🏳️‍🌈 a 🍇 w🍑o🥒r🥨d
45+
Unicode is such fun!
46+
U̷n̷i̷c̷o̴d̴e̷ ̶i̸s̷ ̸s̵u̵c̸h̷ ̸f̵u̷n̴!̵
47+
U̴̡̲͋̾n̵̻̳͌ì̶̠̕c̴̭̈͘ǫ̷̯͋̊d̸͖̩̈̈́ḛ̴́ ̴̟͎͐̈i̴̦̓s̴̜̱͘ ̶̲̮̚s̶̙̞͘u̵͕̯̎̽c̵̛͕̜̓h̶̘̍̽ ̸̜̞̿f̵̤̽ṷ̴͇̎͘ń̷͓̒!̷͍̾̚
48+
U̷̢̢̧̨̼̬̰̪͓̞̠͔̗̼̙͕͕̭̻̗̮̮̥̣͉̫͉̬̲̺͍̺͊̂ͅ\#
49+
n̶̨̢̨̯͓̹̝̲̣̖̞̼̺̬̤̝̊̌́̑̋̋͜͝ͅ\#
50+
ḭ̸̦̺̺͉̳͎́͑\#
51+
c̵̛̘̥̮̙̥̟̘̝͙̤̮͉͔̭̺̺̅̀̽̒̽̏̊̆͒͌̂͌̌̓̈́̐̔̿̂͑͠͝͝ͅ\#
52+
ö̶̱̠̱̤̙͚͖̳̜̰̹̖̣̻͎͉̞̫̬̯͕̝͔̝̟̘͔̙̪̭̲́̆̂͑̌͂̉̀̓́̏̎̋͗͛͆̌̽͌̄̎̚͝͝͝͝ͅ\#
53+
d̶̨̨̡̡͙̟͉̱̗̝͙͍̮͍̘̮͔͑\#
54+
e̶̢͕̦̜͔̘̘̝͈̪̖̺̥̺̹͉͎͈̫̯̯̻͑͑̿̽͂̀̽͋́̎̈́̈̿͆̿̒̈́̽̔̇͐͛̀̓͆̏̾̀̌̈́̆̽̕ͅ
55+
"""#
56+
57+
// Reference answer: the `Character` boundaries as `String` itself sees them.
let expectedBreaks = Array(sampleString.indices)

// Recompute the boundaries by feeding the scalars to the recognizer one by
// one. The first scalar seeds the recognizer and counts as a boundary.
let scalars = sampleString.unicodeScalars
let start = scalars.startIndex
var recognizer = Unicode._CharacterRecognizer(first: scalars[start])
var actualBreaks = [start]
for position in scalars.indices.dropFirst() {
  if recognizer.hasCharacterBoundary(before: scalars[position]) {
    actualBreaks.append(position)
  }
}

expectEqual(actualBreaks, expectedBreaks,
  """
  actualBreaks: \(actualBreaks.map { $0._description })
  expectedBreaks: \(expectedBreaks.map { $0._description })
  """)
75+
}
76+
}

0 commit comments

Comments
 (0)