Skip to content

Commit 98aaa15

Browse files
committed
Implement native normalization for String
use >/< instead of != fix some bugs fix
1 parent 3664eca commit 98aaa15

10 files changed

+611
-709
lines changed

stdlib/public/core/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ set(SWIFTLIB_ESSENTIAL
101101
NativeDictionary.swift
102102
NativeSet.swift
103103
NewtypeWrapper.swift
104+
NFC.swift
105+
NFD.swift
104106
ObjectIdentifier.swift
105107
Optional.swift
106108
OptionSet.swift

stdlib/public/core/GroupInfo.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
"Character.swift",
1010
"CharacterProperties.swift",
1111
"ICU.swift",
12+
"NFC.swift",
13+
"NFD.swift",
1214
"NormalizedCodeUnitIterator.swift",
1315
"SmallString.swift",
1416
"StaticString.swift",

stdlib/public/core/NFC.swift

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import SwiftShims
14+
15+
extension Unicode {
  /// A lightweight wrapper around a string-like value that exposes its
  /// contents in Normalization Form C when iterated.
  ///
  /// The wrapper only stores the value it was created from; all of the
  /// normalization work happens in its iterator.
  internal struct NFC<S: StringProtocol> {
    /// The string (or substring) whose scalars are to be normalized.
    internal let base: S
  }
}
20+
21+
extension Unicode.NFC {
  /// The state needed to turn a stream of decomposed scalars into composed
  /// (NFC) scalars, one scalar per call to `next()`.
  internal struct Iterator {
    /// Scalars that followed the current composee but did not compose with
    /// it, paired with their normalization data. They are emitted after the
    /// composee they trail.
    var buffer: [(scalar: Unicode.Scalar, normData: UInt16)] = []

    /// The scalar currently being built up by composition, if any.
    var composee: Unicode.Scalar?

    /// Whether `buffer` has already been flipped for draining. The buffer is
    /// reversed once so that `removeLast()` hands scalars back in the order
    /// in which they were appended.
    var hasBeenReversed: Bool = false

    /// The underlying NFD iterator that supplies decomposed scalars together
    /// with their normalization data.
    var iterator: Unicode.NFD<S>.Iterator
  }
}
32+
33+
extension Unicode.NFC.Iterator: IteratorProtocol {
  /// Attempts to combine the ordered pair (`x`, `y`) into a single scalar.
  ///
  /// Hangul syllables are composed arithmetically; every other pair is
  /// resolved through `_swift_stdlib_getComposition`.
  ///
  /// - Parameters:
  ///   - x: The left-hand scalar (the composee being built up).
  ///   - y: The right-hand scalar that may combine into `x`.
  /// - Returns: The composed scalar, or `nil` if the pair does not compose.
  internal func compose(
    _ x: Unicode.Scalar,
    and y: Unicode.Scalar
  ) -> Unicode.Scalar? {
    let lhs = x.value
    let rhs = y.value

    // Fast path: ASCII and some Latin scalars never compose when they appear
    // on the right-hand side.
    guard rhs >= 0x300 else {
      return nil
    }

    // Hangul (L, V) -> LV: leading consonant followed by a vowel.
    if (0x1100 ... 0x1112).contains(lhs), (0x1161 ... 0x1175).contains(rhs) {
      let leadingOffset = (lhs &- 0x1100) &* 588
      let vowelOffset = (rhs &- 0x1161) &* 28
      return Unicode.Scalar(_value: 0xAC00 &+ (leadingOffset &+ vowelOffset))
    }

    // Hangul (LV, T) -> LVT: only a syllable without a trailing consonant
    // (offset from U+AC00 divisible by 28) can take one. A syllable in this
    // range that already has one drops through to the table lookup below.
    if (0xAC00 ... 0xD7A3).contains(lhs),
       (0x11A8 ... 0x11C2).contains(rhs),
       (lhs &- 0xAC00) % 28 == 0 {
      return Unicode.Scalar(_value: lhs &+ rhs &- 0x11A7)
    }

    // Everything else: consult the composition table. `.max` is the value
    // that signals "no composition for this pair".
    let composed = _swift_stdlib_getComposition(lhs, rhs)

    if composed == .max {
      return nil
    }

    return Unicode.Scalar(_value: composed)
  }

  /// Produces the next scalar of the NFC-normalized sequence.
  ///
  /// Scalars that trailed the previous composee without composing are
  /// drained first. After that, decomposed scalars are pulled from the
  /// underlying iterator and folded into the current composee until a
  /// scalar arrives that ends the segment.
  ///
  /// - Returns: The next normalized scalar, or `nil` once both the underlying
  ///   iterator and the internal state are exhausted.
  internal mutating func next() -> Unicode.Scalar? {
    // Drain pending scalars before composing anything with the new composee.
    // The buffer is reversed exactly once so that `removeLast()` yields the
    // scalars in the order they were appended.
    if !buffer.isEmpty {
      if !hasBeenReversed {
        buffer.reverse()
        hasBeenReversed = true
      }

      return buffer.removeLast().scalar
    }

    hasBeenReversed = false

    while let current = iterator.next() {
      // The combining class sits above the low three bits of the
      // normalization data.
      let ccc = current.normData >> 3
      // When set, this scalar can never compose with anything before it.
      let isNFCQC = current.normData & 0x6 == 0

      guard let starter = composee else {
        // No composee yet: we are most likely at the start of the string. A
        // class-0 scalar becomes the composee that following scalars try to
        // compose with; anything else is a one-off scalar emitted as is.
        if ccc == 0 {
          composee = current.scalar
          continue
        }

        return current.scalar
      }

      // Blocking check. The current scalar is blocked from the composee when
      // the most recently buffered scalar has a class that is >= the current
      // class. Example: in "z\u{0335}\u{0327}\u{0324}\u{0301}" none of the
      // first three marks compose with 'z', yet U+0301 still does because
      // nothing in between blocks it.
      if let pending = buffer.last, pending.normData >> 3 >= ccc {
        if ccc == 0 {
          // A starter begins a new normalization segment. Make it the new
          // composee and emit the old one; the buffered scalars are flushed
          // by the following calls before this starter is processed further.
          composee = current.scalar
          return starter
        }

        // Not a starter (same or lower class): keep it in order at the end
        // of the buffer.
        buffer.append(current)
        continue
      }

      // Not blocked: try to compose, unless the scalar is NFC_QC and thus
      // can never compose with anything preceding it.
      if !isNFCQC, let composed = compose(starter, and: current.scalar) {
        // Composition found. Record it as the new composee and move on.
        composee = composed
        continue
      }

      // No composition. With nothing buffered, a class-0 scalar ends the
      // segment: it becomes the new composee and the finished one is emitted.
      if buffer.isEmpty && ccc == 0 {
        composee = current.scalar
        return starter
      }

      // Otherwise hold the scalar in the buffer for eventual emission.
      buffer.append(current)
    }

    // The input is exhausted; hand back any leftover composee exactly once.
    return composee.take()
  }
}
190+
191+
extension Unicode.NFC: Sequence {
  /// Creates an iterator that yields the NFC-normalized scalars of `base`.
  ///
  /// The iterator is seeded with the NFD iterator of `base`; composition is
  /// performed lazily as the result is consumed.
  internal func makeIterator() -> Iterator {
    let decomposed = base.nfd.makeIterator()
    return Iterator(iterator: decomposed)
  }
}
196+
197+
extension StringProtocol {
  /// A view of this string whose iteration yields its scalars in
  /// Normalization Form C.
  internal var nfc: Unicode.NFC<Self> {
    return Unicode.NFC<Self>(base: self)
  }
}

0 commit comments

Comments
 (0)