Skip to content

Commit 83f983e

Browse files
committed
[stdlib] _CharacterRecognizer._firstBreak(inUncheckedUnsafeUTF8Buffer:startingAt:)
1 parent 9922b00 commit 83f983e

File tree

2 files changed

+84
-2
lines changed

2 files changed

+84
-2
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,44 @@ extension Unicode {
506506
_previous = next
507507
return r
508508
}
509+
510+
/// Scans UTF-8 code units for the first grapheme break, feeding every
/// decoded scalar to this recognizer along the way.
///
/// Decoding begins at `start` and advances one scalar at a time. As soon as
/// the recognizer reports a grapheme break before a scalar, scanning stops
/// and the result is the range of buffer offsets occupied by that scalar --
/// that is, the scalar immediately *following* the break. If the end of the
/// buffer is reached without encountering a break, the result is `nil`.
///
/// On return, the recognizer's state reflects all scalars that were fed to
/// it, up to and including the returned one. To look for additional grapheme
/// breaks, keep feeding the recognizer the data that follows, for example by
/// calling this function again with `start` set to the returned range's
/// upper bound.
///
/// - Parameter buffer: A buffer of valid UTF-8 data that both begins and
///   ends on a Unicode scalar boundary.
///
/// - Parameter start: A valid index into `buffer` that addresses either the
///   first code unit of a UTF-8 scalar or the end of the buffer. Defaults to
///   `0`. When `start` is at the end, nothing is decoded and the result is
///   `nil`.
///
/// - Returns: The index range of the scalar that follows the first grapheme
///   break found at or after `start`, or `nil` if no grapheme break was
///   found.
///
/// - Warning: The contents of `buffer` are not validated. Behavior is
///   undefined if the buffer holds anything other than valid UTF-8.
@_effects(releasenone)
public mutating func _firstBreak(
  inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer<UInt8>,
  startingAt start: Int = 0
) -> Range<Int>? {
  let end = buffer.endIndex
  var offset = start
  while offset < end {
    // `length` is used as the number of code units the decoded scalar spans.
    let (scalar, length) = _decodeScalar(buffer, startingAt: offset)
    // Non-trapping addition, matching the unchecked nature of this API.
    let scalarEnd = offset &+ length
    guard !hasBreak(before: scalar) else {
      // A break precedes `scalar`; report where that scalar lives.
      return Range(_uncheckedBounds: (offset, scalarEnd))
    }
    offset = scalarEnd
  }
  // Ran off the end of the buffer without seeing a break.
  return nil
}
509547
}
510548
}
511549

test/stdlib/CharacterRecognizer.swift

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,50 @@ if #available(SwiftStdlib 5.8, *) {
3333
}
3434
}
3535

36+
/// Decodes a sequence of UTF-8 code units into an array of Unicode scalars.
///
/// Each decoding error is reported as a test failure ("Invalid scalar") and
/// decoding then continues with the remaining input.
///
/// - Parameter buffer: The UTF-8 code units to decode.
/// - Returns: The successfully decoded scalars, in input order.
func scalars(in buffer: some Sequence<UInt8>) -> [Unicode.Scalar] {
  var decoded: [Unicode.Scalar] = []
  var codeUnits = buffer.makeIterator()
  var decoder = UTF8()
  decoding: while true {
    switch decoder.decode(&codeUnits) {
    case .scalarValue(let scalar):
      decoded.append(scalar)
    case .emptyInput:
      // Input exhausted; leave the loop and return what was collected.
      break decoding
    case .error:
      expectTrue(false, "Invalid scalar")
    }
  }
  return decoded
}
48+
3649
if #available(SwiftStdlib 5.8, *) {
37-
suite.test("Consistency with Swift String's behavior") {
38-
let sampleString = #"""
50+
// Splits each grapheme break test string into pieces using `_firstBreak`
// and compares the result against the pieces recorded for that test.
suite.test("Unicode test data/_firstBreak") {
  for test in graphemeBreakTests {
    var recognizer = Unicode._CharacterRecognizer()
    var pieces: [[Unicode.Scalar]] = []
    // `withUTF8` is a mutating method, so it needs a mutable copy.
    var str = test.string
    str.withUTF8 { utf8 in
      var cursor = utf8.startIndex
      var pieceStart = cursor
      // Keep asking for the next break until the buffer is exhausted or the
      // recognizer finds no further break.
      while cursor < utf8.endIndex,
        let following = recognizer._firstBreak(
          inUncheckedUnsafeUTF8Buffer: utf8, startingAt: cursor)
      {
        let breakOffset = following.lowerBound
        // A break located exactly at `pieceStart` would produce an empty
        // piece, so nothing is recorded for it.
        if pieceStart < breakOffset {
          pieces.append(scalars(in: utf8[pieceStart..<breakOffset]))
        }
        pieceStart = breakOffset
        // The scalar after the break has already been fed to the
        // recognizer; resume scanning right after it.
        cursor = following.upperBound
      }

      // Whatever remains after the last break forms the final piece.
      pieces.append(scalars(in: utf8[pieceStart...]))
    }
    expectEqual(
      pieces, test.pieces,
      "string: \(String(reflecting: test.string))")
  }
}
77+
}
78+
79+
let sampleString = #"""
3980
The powerful programming language that is also easy to learn.
4081
손쉽게 학습할 수 있는 강력한 프로그래밍 언어.
4182
🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈
@@ -52,6 +93,9 @@ if #available(SwiftStdlib 5.8, *) {
5293
e̶̢͕̦̜͔̘̘̝͈̪̖̺̥̺̹͉͎͈̫̯̯̻͑͑̿̽͂̀̽͋́̎̈́̈̿͆̿̒̈́̽̔̇͐͛̀̓͆̏̾̀̌̈́̆̽̕ͅ
5394
"""#
5495

96+
if #available(SwiftStdlib 5.8, *) {
97+
suite.test("Consistency with Swift String's behavior/hasBreak") {
98+
5599
let expectedBreaks = Array(sampleString.indices)
56100

57101
let u = sampleString.unicodeScalars

0 commit comments

Comments
 (0)