Skip to content

Commit d10b3f8

Browse files
Optimization pass over String and UTF8Span's allASCII helper (#82540)
This ranges between parity (for very small strings) and 5x faster (for 32-63B strings) in benchmarking on M1 MBP. For largeish strings it delivers a roughly 2x speedup; further increase in blocksize nets a small win in microbenchmarks that I do not expect would translate to real world usage due to codesize impact and the fact that most strings are smallish. There's some opportunity for further work here; in particular, if people start building Swift for a baseline of AVX2 or AVX512, we should have paths for that (and we should also implement them if/when we get better multiversioning dispatch machinery in the language). Span adoption would be interesting. It's likely we should have a dedicated "small core" implementation that uses only aligned accesses. Still, this is a significant improvement as-is, and we should land it. ![allASCII](https://github.com/user-attachments/assets/ebbc45ba-5ba8-42dd-bf63-31ca77844fca)
1 parent 3739956 commit d10b3f8

File tree

1 file changed

+112
-46
lines changed

1 file changed

+112
-46
lines changed

stdlib/public/core/StringCreate.swift

Lines changed: 112 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -13,60 +13,126 @@
1313
//===----------------------------------------------------------------------===//
1414

1515
internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
16-
if input.isEmpty { return true }
17-
18-
// NOTE: Avoiding for-in syntax to avoid bounds checks
19-
//
20-
// TODO(String performance): SIMD-ize
21-
//
22-
let count = input.count
23-
var ptr = unsafe UnsafeRawPointer(input.baseAddress._unsafelyUnwrappedUnchecked)
24-
25-
let asciiMask64 = 0x8080_8080_8080_8080 as UInt64
26-
let asciiMask32 = UInt32(truncatingIfNeeded: asciiMask64)
27-
let asciiMask16 = UInt16(truncatingIfNeeded: asciiMask64)
28-
let asciiMask8 = UInt8(truncatingIfNeeded: asciiMask64)
29-
30-
let end128 = unsafe ptr + count & ~(MemoryLayout<(UInt64, UInt64)>.stride &- 1)
31-
let end64 = unsafe ptr + count & ~(MemoryLayout<UInt64>.stride &- 1)
32-
let end32 = unsafe ptr + count & ~(MemoryLayout<UInt32>.stride &- 1)
33-
let end16 = unsafe ptr + count & ~(MemoryLayout<UInt16>.stride &- 1)
34-
let end = unsafe ptr + count
35-
36-
37-
while unsafe ptr < end128 {
38-
let pair = unsafe ptr.loadUnaligned(as: (UInt64, UInt64).self)
39-
let result = (pair.0 | pair.1) & asciiMask64
40-
guard result == 0 else { return false }
41-
unsafe ptr = unsafe ptr + MemoryLayout<(UInt64, UInt64)>.stride
16+
//--------------- Implementation building blocks ---------------------------//
17+
#if arch(arm64_32)
18+
typealias Word = UInt64
19+
#else
20+
typealias Word = UInt
21+
#endif
22+
let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)
23+
24+
#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
25+
// TODO: Should consider AVX2 / AVX512 / AVX10 path here
26+
typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
27+
@_transparent func pmovmskb(_ vec: SIMD16<UInt8>) -> UInt16 {
28+
UInt16(Builtin.bitcast_Vec16xInt1_Int16(
29+
Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
30+
))
31+
}
32+
#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
33+
typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
34+
@_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
35+
UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
4236
}
37+
#else
38+
typealias Block = (Word, Word, Word, Word)
39+
#endif
4340

44-
// If we had enough bytes for two iterations of this, we would have hit
45-
// the loop above, so we only need to do this once
46-
if unsafe ptr < end64 {
47-
let value = unsafe ptr.loadUnaligned(as: UInt64.self)
48-
guard value & asciiMask64 == 0 else { return false }
49-
unsafe ptr = unsafe ptr + MemoryLayout<UInt64>.stride
41+
@_transparent
42+
func allASCII(wordAt pointer: UnsafePointer<UInt8>) -> Bool {
43+
let word = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Word.self)
44+
return word & mask == 0
5045
}
5146

52-
if unsafe ptr < end32 {
53-
let value = unsafe ptr.loadUnaligned(as: UInt32.self)
54-
guard value & asciiMask32 == 0 else { return false }
55-
unsafe ptr = unsafe ptr + MemoryLayout<UInt32>.stride
47+
@_transparent
48+
func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
49+
let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
50+
#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
51+
return pmovmskb(block.0 | block.1) == 0
52+
#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
53+
return umaxv(block.0 | block.1) < 0x80
54+
#else
55+
return (block.0 | block.1 | block.2 | block.3) & mask == 0
56+
#endif
57+
}
58+
//----------------------- Implementation proper ----------------------------//
59+
guard input.count >= MemoryLayout<Word>.size else {
60+
// They gave us a region of memory
61+
// whose size is as modest as it can be.
62+
// We'll check every byte
63+
// for the bit of most height
64+
// and return if we happen on any
65+
//
66+
// I'm sorry, I'm sorry, I'm trying to delete it. (This chunk of code, not
67+
// the Limerick. I would wager that--at least for Strings--we could
68+
// unconditionally load 16B here,¹ because of the small string encoding,
69+
// and check them all at once, which would be much more efficient. That
70+
// probably has to happen by lifting this check into the SmallString
71+
// initializer directly, though.)
72+
//
73+
// ¹ well, most of the time, which makes it a rather conditional
74+
// "unconditionally".
75+
return unsafe input.allSatisfy { $0 < 0x80 }
5676
}
5777

58-
if unsafe ptr < end16 {
59-
let value = unsafe ptr.loadUnaligned(as: UInt16.self)
60-
guard value & asciiMask16 == 0 else { return false }
61-
unsafe ptr = unsafe ptr + MemoryLayout<UInt16>.stride
78+
// input.count is non-zero, so we can unconditionally unwrap baseAddress.
79+
let base = unsafe input.baseAddress._unsafelyUnwrappedUnchecked
80+
let n = input.count
81+
var i = 0
82+
83+
guard n >= MemoryLayout<Block>.size else {
84+
// The size isn't yet to a block
85+
// word-by-word we are forced to walk.
86+
// So as to not leave a gap
87+
// the last word may lap
88+
// the word that we already chalked.
89+
//
90+
// 0 k 2k 3k ?k n-k n-1
91+
// | | | | | | |
92+
// +------+------+------+ +------+ |
93+
// | word | word | word | ... | word | |
94+
// +------+------+------+ +------+ v
95+
// +------+
96+
// possibly overlapping final word > | word |
97+
// +------+
98+
//
99+
// This means that we check any bytes in the overlap region twice, but
100+
// that's much preferable to using smaller accesses to avoid rechecking,
101+
// because the entire last word is about as expensive as checking just
102+
// one byte would be, and on average there's more than one byte remaining.
103+
//
104+
// Note that we don't bother trying to align any of these accesses, because
105+
// there is minimal benefit to doing so on "modern" OoO cores, which can
106+
// handle cacheline-crossing loads at full speed. If the string happens to
107+
// be aligned, they'll be aligned, if not, they won't be. It will likely
108+
// make sense to add a path that does align everything for more limited
109+
// embedded CPUs, though.
110+
let k = MemoryLayout<Word>.size
111+
let last = n &- k
112+
while i < last {
113+
guard unsafe allASCII(wordAt: base + i) else { return false }
114+
i &+= k
115+
}
116+
return unsafe allASCII(wordAt: base + last)
62117
}
63-
64-
if unsafe ptr < end {
65-
let value = unsafe ptr.loadUnaligned(fromByteOffset: 0, as: UInt8.self)
66-
guard value & asciiMask8 == 0 else { return false }
118+
119+
// check block-by-block, with a possibly overlapping last block to avoid
120+
// sub-block cleanup. We should be able to avoid manual index arithmetic
121+
// and write this loop and the one above something like the following:
122+
//
123+
// return stride(from: 0, to: last, by: k).allSatisfy {
124+
// allASCII(blockAt: base + $0)
125+
// } && allASCII(blockAt: base + last)
126+
//
127+
// but LLVM leaves one unnecessary conditional operation in the loop
128+
// when we do that, so we write them out as while loops instead for now.
129+
let k = MemoryLayout<Block>.size
130+
let last = n &- k
131+
while i < last {
132+
guard unsafe allASCII(blockAt: base + i) else { return false }
133+
i &+= k
67134
}
68-
unsafe _internalInvariant(ptr == end || ptr + 1 == end)
69-
return true
135+
return unsafe allASCII(blockAt: base + last)
70136
}
71137

72138
extension String {

0 commit comments

Comments
 (0)