Skip to content

Commit 1706d4c

Browse files
committed
[String] Refactor and fast-path normalization
Refactor some normalization queries into StringNormalization.swift, and add more fast-paths for Latin-range scalars (< 0x300).
1 parent 8d2af45 commit 1706d4c

File tree

2 files changed

+67
-52
lines changed

2 files changed

+67
-52
lines changed

stdlib/public/core/NormalizedCodeUnitIterator.swift

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,58 +16,6 @@ extension _Normalization {
1616
internal typealias _SegmentOutputBuffer = _FixedArray16<UInt16>
1717
}
1818

19-
extension Unicode.Scalar {
20-
// Normalization boundary - a place in a string where everything left of the
21-
// boundary can be normalized independently from everything right of the
22-
// boundary. The concatenation of each result is the same as if the entire
23-
// string had been normalized as a whole.
24-
//
25-
// Normalization segment - a sequence of code units between two normalization
26-
// boundaries (without any boundaries in the middle). Note that normalization
27-
// segments can, as a process of normalization, expand, contract, and even
28-
// produce new sub-segments.
29-
30-
// Whether this scalar value always has a normalization boundary before it.
31-
internal var _hasNormalizationBoundaryBefore: Bool {
32-
_internalInvariant(Int32(exactly: self.value) != nil, "top bit shouldn't be set")
33-
let value = Int32(bitPattern: self.value)
34-
return 0 != __swift_stdlib_unorm2_hasBoundaryBefore(
35-
_Normalization._nfcNormalizer, value)
36-
}
37-
internal var _isNFCQCYes: Bool {
38-
return __swift_stdlib_u_getIntPropertyValue(
39-
Builtin.reinterpretCast(value), __swift_stdlib_UCHAR_NFC_QUICK_CHECK
40-
) == 1
41-
}
42-
}
43-
44-
internal func _tryNormalize(
45-
_ input: UnsafeBufferPointer<UInt16>,
46-
into outputBuffer:
47-
UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>
48-
) -> Int? {
49-
return _tryNormalize(input, into: _castOutputBuffer(outputBuffer))
50-
}
51-
internal func _tryNormalize(
52-
_ input: UnsafeBufferPointer<UInt16>,
53-
into outputBuffer: UnsafeMutableBufferPointer<UInt16>
54-
) -> Int? {
55-
var err = __swift_stdlib_U_ZERO_ERROR
56-
let count = __swift_stdlib_unorm2_normalize(
57-
_Normalization._nfcNormalizer,
58-
input.baseAddress._unsafelyUnwrappedUnchecked,
59-
numericCast(input.count),
60-
outputBuffer.baseAddress._unsafelyUnwrappedUnchecked,
61-
numericCast(outputBuffer.count),
62-
&err
63-
)
64-
guard err.isSuccess else {
65-
// The output buffer needs to grow
66-
return nil
67-
}
68-
return numericCast(count)
69-
}
70-
7119
//
7220
// Pointer casting helpers
7321
//

stdlib/public/core/StringNormalization.swift

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,70 @@ internal enum _Normalization {
3535
internal static let _maxNFCExpansionFactor = 3
3636
}
3737

38+
extension Unicode.Scalar {
39+
// Normalization boundary - a place in a string where everything left of the
40+
// boundary can be normalized independently from everything right of the
41+
// boundary. The concatenation of each result is the same as if the entire
42+
// string had been normalized as a whole.
43+
//
44+
// Normalization segment - a sequence of code units between two normalization
45+
// boundaries (without any boundaries in the middle). Note that normalization
46+
// segments can, as a process of normalization, expand, contract, and even
47+
// produce new sub-segments.
48+
49+
// Whether this scalar value always has a normalization boundary before it.
50+
@inline(__always) // common fast-path
51+
internal var _hasNormalizationBoundaryBefore: Bool {
52+
// Fast-path: All scalars up through U+02FF are NFC and have boundaries
53+
// before them
54+
if self.value < 0x300 { return true }
55+
56+
_internalInvariant(Int32(exactly: self.value) != nil, "top bit shouldn't be set")
57+
let value = Int32(bitPattern: self.value)
58+
return 0 != __swift_stdlib_unorm2_hasBoundaryBefore(
59+
_Normalization._nfcNormalizer, value)
60+
}
61+
@inline(__always) // common fast-path
62+
internal var _isNFCQCYes: Bool {
63+
// Fast-path: All scalars up through U+02FF are NFC and have boundaries
64+
// before them
65+
if self.value < 0x300 { return true }
66+
67+
return __swift_stdlib_u_getIntPropertyValue(
68+
Builtin.reinterpretCast(value), __swift_stdlib_UCHAR_NFC_QUICK_CHECK
69+
) == 1
70+
}
71+
72+
// Quick check if a scalar is NFC and a segment starter
73+
internal var _isNFCStarter: Bool {
74+
// Both properties take their own fast-path for scalars below 0x300 before doing the full lookup
75+
return self._hasNormalizationBoundaryBefore && self._isNFCQCYes
76+
}
77+
}
78+
79+
internal func _tryNormalize(
80+
_ input: UnsafeBufferPointer<UInt16>,
81+
into outputBuffer:
82+
UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>
83+
) -> Int? {
84+
return _tryNormalize(input, into: _castOutputBuffer(outputBuffer))
85+
}
86+
internal func _tryNormalize(
87+
_ input: UnsafeBufferPointer<UInt16>,
88+
into outputBuffer: UnsafeMutableBufferPointer<UInt16>
89+
) -> Int? {
90+
var err = __swift_stdlib_U_ZERO_ERROR
91+
let count = __swift_stdlib_unorm2_normalize(
92+
_Normalization._nfcNormalizer,
93+
input.baseAddress._unsafelyUnwrappedUnchecked,
94+
numericCast(input.count),
95+
outputBuffer.baseAddress._unsafelyUnwrappedUnchecked,
96+
numericCast(outputBuffer.count),
97+
&err
98+
)
99+
guard err.isSuccess else {
100+
// The output buffer needs to grow
101+
return nil
102+
}
103+
return numericCast(count)
104+
}

0 commit comments

Comments
 (0)