Skip to content

Commit 753a940

Browse files
authored
ASCII fast paths for grapheme stride (#72064)
1 parent 036a13d commit 753a940

File tree

2 files changed

+61
-15
lines changed

2 files changed

+61
-15
lines changed

stdlib/public/core/StringCharacterView.swift

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ extension String: BidirectionalCollection {
249249
}
250250
return i
251251
}
252-
252+
253253
/// Returns the distance between two indices.
254254
///
255255
/// - Parameters:
@@ -269,23 +269,25 @@ extension String: BidirectionalCollection {
269269
let start = _guts.validateInclusiveCharacterIndex_5_7(start)
270270
let end = _guts.validateInclusiveCharacterIndex_5_7(end)
271271

272-
// TODO: known-ASCII and single-scalar-grapheme fast path, etc.
273-
274272
// Per SE-0180, `start` and `end` are allowed to fall in between Character
275273
// boundaries, in which case this function must still terminate without
276274
// trapping and return a result that makes sense.
277-
278-
var i = start
275+
var i = start._encodedOffset
279276
var count = 0
280-
if i < end {
281-
while i < end { // Note `<` instead of `==`
282-
count += 1
283-
i = _uncheckedIndex(after: i)
277+
if start < end {
278+
while i < end._encodedOffset { // Note `<` instead of `==`
279+
count &+= 1
280+
/*
281+
For the purposes of this loop, this should be equivalent to
282+
_uncheckedIndex(after: i). We don't need to spend time setting up
283+
actual Indexes when we only care about counting strides.
284+
*/
285+
i &+= _guts._opaqueCharacterStride(startingAt: i)
284286
}
285-
} else if i > end {
286-
while i > end { // Note `<` instead of `==`
287-
count -= 1
288-
i = _uncheckedIndex(before: i)
287+
} else if start > end {
288+
while i > end._encodedOffset { // Note `>` instead of `==`
289+
count &-= 1
290+
i &-= _guts._opaqueCharacterStride(endingAt: i)
289291
}
290292
}
291293
return count

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,30 @@ extension _StringGuts {
196196
/// inconsistent with `_opaqueCharacterStride(endingAt:)`. On the other hand,
197197
/// this behavior makes this suitable for use in substrings whose start index
198198
/// itself does not fall on a cluster boundary.
199-
@usableFromInline @inline(never)
199+
@usableFromInline @inline(__always)
200200
@_effects(releasenone)
201201
internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
202+
_internalInvariant(i < endIndex._encodedOffset)
203+
if isFastUTF8 {
204+
let fast = withFastUTF8 { utf8 in
205+
if i &+ 1 == utf8.count { return true }
206+
let pair = UnsafeRawPointer(
207+
utf8.baseAddress.unsafelyUnwrapped
208+
).loadUnaligned(fromByteOffset: i, as: UInt16.self)
209+
// `pair & 0x8080 == 0` means both bytes are ASCII; `pair != 0x0A0D` means they are not CR LF (0x0D, 0x0A as loaded little-endian)
210+
return pair & 0x8080 == 0 && pair != 0x0A0D
211+
}
212+
if _fastPath(fast) {
213+
_internalInvariant(_opaqueComplexCharacterStride(startingAt: i) == 1)
214+
return 1
215+
}
216+
}
217+
218+
return _opaqueComplexCharacterStride(startingAt: i)
219+
}
220+
221+
@_effects(releasenone) @inline(never)
222+
internal func _opaqueComplexCharacterStride(startingAt i: Int) -> Int {
202223
if _slowPath(isForeign) {
203224
return _foreignOpaqueCharacterStride(startingAt: i)
204225
}
@@ -221,9 +242,32 @@ extension _StringGuts {
221242
///
222243
/// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always
223244
/// finds a correct grapheme cluster boundary.
224-
@usableFromInline @inline(never)
245+
246+
@usableFromInline @inline(__always)
225247
@_effects(releasenone)
226248
internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
249+
if i <= 1 {
250+
return i
251+
}
252+
if isFastUTF8 {
253+
let fast = withFastUTF8 { utf8 in
254+
let pair = UnsafeRawPointer(
255+
utf8.baseAddress.unsafelyUnwrapped
256+
).loadUnaligned(fromByteOffset: i &- 2, as: UInt16.self)
257+
// `pair & 0x8080 == 0` means both bytes are ASCII; `pair != 0x0A0D` means they are not CR LF (0x0D, 0x0A as loaded little-endian)
258+
return pair & 0x8080 == 0 && pair != 0x0A0D
259+
}
260+
if _fastPath(fast) {
261+
_internalInvariant(_opaqueComplexCharacterStride(endingAt: i) == 1)
262+
return 1
263+
}
264+
}
265+
266+
return _opaqueComplexCharacterStride(endingAt: i)
267+
}
268+
269+
@_effects(releasenone) @inline(never)
270+
internal func _opaqueComplexCharacterStride(endingAt i: Int) -> Int {
227271
if _slowPath(isForeign) {
228272
return _foreignOpaqueCharacterStride(endingAt: i)
229273
}

0 commit comments

Comments
 (0)