Skip to content

Commit 9b25a2d

Browse files
committed
Add a Unicode normalization data generator
1 parent 25f7145 commit 9b25a2d

File tree

12 files changed

+44478
-9
lines changed

12 files changed

+44478
-9
lines changed

utils/gen-unicode-data/Data/DerivedNormalizationProps.txt

Lines changed: 9829 additions & 0 deletions
Large diffs are not rendered by default.

utils/gen-unicode-data/Data/UnicodeData.txt

Lines changed: 33797 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
// This was auto-generated by utils/gen-unicode-data/GenNormalization,
14+
// please do not edit this file yourself!
15+
16+
#include "../SwiftShims/UnicodeData.h"
17+
#include <limits>
18+

utils/gen-unicode-data/Package.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ let package = Package(
99
.target(
1010
name: "GenUtils",
1111
dependencies: []
12+
),
13+
.executableTarget(
14+
name: "GenNormalization",
15+
dependencies: ["GenUtils"]
1216
)
1317
]
1418
)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import GenUtils
14+
15+
/// Records the Canonical Combining Class (CCC) of every scalar listed in the
/// contents of the UnicodeData file.
///
/// Each line in this data file is formatted like the following:
///
///     0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
///
/// Where each section is split by a ';'. The first section is the scalar, in
/// hexadecimal, that the line describes. For the purposes of CCC data, we only
/// need the 0 in between the Cc and BN (index 3) which is the raw value for
/// the CCC.
///
/// - Parameters:
///   - data: The full text of the UnicodeData file. Any newline convention
///     (LF, CRLF, CR) is accepted.
///   - dict: A dictionary keyed by scalar. The CCC, shifted left by 3, is
///     OR'ed into the existing value for the scalar (or into 0 when there is
///     none). Scalars whose CCC is 0 are never inserted.
func getCCCData(from data: String, with dict: inout [UInt32: UInt16]) {
  // Split on every newline character rather than just "\n". Swift treats
  // "\r\n" as a single Character that is not equal to "\n", so splitting on
  // "\n" alone would leave a CRLF file as one giant line and silently produce
  // no data at all.
  for line in data.split(whereSeparator: \.isNewline) {
    let components = line.split(separator: ";", omittingEmptySubsequences: false)

    // A record needs at least the scalar (index 0) and the CCC (index 3).
    // Anything shorter, e.g. a whitespace only line, is not a record.
    guard components.count > 3 else {
      continue
    }

    // Malformed numbers mean the data file is corrupt; stop loudly instead of
    // generating wrong tables.
    guard let ccc = UInt16(components[3]) else {
      fatalError("Invalid Canonical Combining Class in line: \(line)")
    }

    // For the most part, CCC 0 is the default case, so we can save much more
    // space by not keeping this information and making it the fallback case.
    if ccc == 0 {
      continue
    }

    guard let scalar = UInt32(components[0], radix: 16) else {
      fatalError("Invalid scalar in line: \(line)")
    }

    // Store our ccc past the 3rd bit.
    dict[scalar, default: 0] |= ccc << 3
  }
}
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import GenUtils
14+
15+
/// Collects every scalar range that carries the `Full_Composition_Exclusion`
/// property in the contents of the DerivedNormalizationProps file.
///
/// Records in this file look like one of the following, optionally followed by
/// a '#' comment:
///
///     0340..0341    ; Full_Composition_Exclusion # ...
///     0343          ; Full_Composition_Exclusion # ...
///
/// - Parameter data: The full text of the data file. Any newline convention
///   (LF, CRLF, CR) is accepted.
/// - Returns: One closed range per matching record, in file order. A record
///   naming a single scalar produces a range containing just that scalar.
func getCompExclusions(from data: String) -> [ClosedRange<UInt32>] {
  var result: [ClosedRange<UInt32>] = []

  // Split on every newline character rather than just "\n". Swift treats
  // "\r\n" as a single Character that is not equal to "\n", so splitting on
  // "\n" alone would leave a CRLF file as one giant line that gets skipped as
  // a comment.
  for line in data.split(whereSeparator: \.isNewline) {
    // Everything from the first '#' onwards is a comment. Lines that are
    // entirely a comment leave nothing behind.
    let record = line.prefix(while: { $0 != "#" })
    let components = record.split(separator: ";")

    // A record needs at least a scalar section and a property section. Blank
    // lines, whitespace only lines and comments have neither.
    guard components.count >= 2 else {
      continue
    }

    // Get the property first because we only care about Full_Composition_Exclusion
    let filteredProperty = components[1].filter { !$0.isWhitespace }

    guard filteredProperty == "Full_Composition_Exclusion" else {
      continue
    }

    let scalars: ClosedRange<UInt32>

    let filteredScalars = components[0].filter { !$0.isWhitespace }

    // If we have . appear, it means we have a legitimate range. Otherwise,
    // it's a singular scalar. Malformed scalars mean the data file is corrupt,
    // so stop loudly instead of generating wrong tables.
    if filteredScalars.contains(".") {
      let range = filteredScalars.split(separator: ".")

      guard range.count == 2,
            let lower = UInt32(range[0], radix: 16),
            let upper = UInt32(range[1], radix: 16),
            lower <= upper else {
        fatalError("Malformed scalar range in line: \(line)")
      }

      scalars = lower ... upper
    } else {
      guard let scalar = UInt32(filteredScalars, radix: 16) else {
        fatalError("Malformed scalar in line: \(line)")
      }

      scalars = scalar ... scalar
    }

    result.append(scalars)
  }

  return result
}
55+
56+
/// Appends all of the composition data to `result`: first `mph` itself, emitted
/// under the name `_swift_stdlib_nfc_comp`, followed by the composition arrays
/// produced by `emitCompComps` for `data`.
func emitComp(
  _ mph: Mph,
  _ data: [(UInt32, [UInt32])],
  into result: inout String
) {
  // The hash tables go first, then the arrays that are looked up through them.
  emitMph(
    mph,
    name: "_swift_stdlib_nfc_comp",
    into: &result
  )

  emitCompComps(
    mph,
    data,
    into: &result
  )
}
60+
61+
/// Emits one composition array per unique second scalar (y) found in `data`,
/// followed by `_swift_stdlib_nfc_comp_indices`, an array of pointers to them.
///
/// Every element of `data` is `(composed, [x, y])`. Entries are grouped by the
/// index that `mph` assigns to y, and each emitted array is laid out as:
///
/// - Element 0: y in the low 21 bits and the total number of elements in the
///   array (this one included) in the bits above.
/// - Remaining elements, sorted by x: x in the low 17 bits, the magnitude of
///   `composed - x` in the next 14 bits and, in the top bit, whether that
///   distance is negative.
///
/// - Parameters:
///   - mph: The hash used to map each y scalar to its array index.
///   - data: The compositions to emit.
///   - result: The string that the generated source is appended to.
func emitCompComps(
  _ mph: Mph,
  _ data: [(UInt32, [UInt32])],
  into result: inout String
) {
  let uniqueKeys = Set(data.map { $0.1[1] })
  var sortedData: [[(UInt32, UInt32)]] = .init(
    repeating: [],
    count: uniqueKeys.count
  )

  for (scalar, comp) in data {
    let idx = mph.index(for: UInt64(comp[1]))

    // The first element of every array is a header holding the y scalar. It is
    // tagged with .max in the second slot so it can be recognized when emitting.
    if sortedData[idx].isEmpty {
      sortedData[idx].append((comp[1], .max))
    }

    sortedData[idx].append((comp[0], scalar))
  }

  // Go back and sort each array as well as putting the size information of each
  // in the first element (which contains the original y scalar that was hashed).
  for i in sortedData.indices {
    sortedData[i][1...].sort { $0.0 < $1.0 }

    // The size lives in the 11 bits above the 21 bit scalar, so it has to fit.
    let count = sortedData[i].count
    precondition(count <= 0x7FF, "Composition array is too large to encode its size")

    sortedData[i][0].0 |= UInt32(count) << 21
  }

  for i in sortedData.indices {
    result += """
    static const __swift_uint32_t _swift_stdlib_nfc_comp\(i)[\(sortedData[i].count)] = {

    """

    formatCollection(sortedData[i], into: &result) { (comp, scalar) in
      // This only occurs for the first element which stores the original y
      // scalar in the composition and the size of the array.
      if scalar == .max {
        return "0x\(String(comp, radix: 16, uppercase: true))"
      }

      // Make sure that these scalars don't exceed past 17 bits. We need the other
      // 15 bits to store the range to the final composition. Although Unicode
      // scalars can go up to 21 bits, none of the compositions with this scalar
      // go that high.
      //
      // These are preconditions rather than assertions so that an optimized
      // build of the generator can never silently emit corrupt tables.
      precondition(comp <= 0x1FFFF, "Composition scalar does not fit in 17 bits")
      var value = comp

      // Calculate the distance from our current composition scalar to our final
      // composed scalar.
      let distance = Int(scalar) - Int(comp)
      // Make sure that our distance doesn't exceed 14 bits. Although the above
      // check gives us 15 bits, we use the last bit to indicate negative
      // or not. Check the magnitude so that large negative distances are
      // rejected too instead of overflowing into the sign bit.
      precondition(
        distance.magnitude <= 0x3FFF,
        "Composition distance does not fit in 14 bits"
      )

      value |= UInt32(distance.magnitude) << 17

      if distance < 0 {
        value |= 1 << 31
      }

      return "0x\(String(value, radix: 16, uppercase: true))"
    }

    result += "\n};\n\n"
  }

  result += """
  static const __swift_uint32_t * const _swift_stdlib_nfc_comp_indices[\(sortedData.count)] = {

  """

  formatCollection(sortedData.indices, into: &result) { i in
    return "_swift_stdlib_nfc_comp\(i)"
  }

  result += "\n};\n\n"
}
141+
142+
/// Appends the C++ source of `_swift_stdlib_getComposition(x, y)` to `result`.
///
/// The only generated piece is `mph.bitArrays.count`, which is interpolated as
/// the second argument of the call to `_swift_stdlib_getMphIdx`. Everything
/// else is fixed text.
///
/// The emitted function works as follows:
///
/// 1. Hashes `y` to pick an array out of `_swift_stdlib_nfc_comp_indices`.
/// 2. Compares the low 21 bits of that array's first element against `y` and
///    returns the maximum `__swift_uint32_t` value when they differ.
/// 3. Reads the element count from the bits above bit 21 of the first element
///    and binary searches elements `1 ... count - 1`, comparing `x` against
///    the low 17 bits of each entry.
/// 4. On a match, returns those low 17 bits plus the 14 bit distance stored
///    above them, negated when the top bit of the entry is set. Otherwise it
///    returns the maximum `__swift_uint32_t` value.
///
/// - Parameters:
///   - mph: Supplies the `bitArrays` count passed to `_swift_stdlib_getMphIdx`.
///   - result: The string that the generated source is appended to.
func emitCompAccessor(_ mph: Mph, into result: inout String) {
143+
result += """
144+
SWIFT_RUNTIME_STDLIB_INTERNAL
145+
__swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
146+
__swift_uint32_t y) {
147+
__swift_intptr_t compIdx = _swift_stdlib_getMphIdx(y, \(mph.bitArrays.count),
148+
_swift_stdlib_nfc_comp_keys,
149+
_swift_stdlib_nfc_comp_ranks,
150+
_swift_stdlib_nfc_comp_sizes);
151+
auto array = _swift_stdlib_nfc_comp_indices[compIdx];
152+
153+
// Ensure that the first element in this array is equal to our y scalar.
154+
auto realY = (array[0] << 11) >> 11;
155+
156+
if (y != realY) {
157+
return std::numeric_limits<__swift_uint32_t>::max();
158+
}
159+
160+
auto count = array[0] >> 21;
161+
162+
__swift_uint32_t low = 1;
163+
__swift_uint32_t high = count - 1;
164+
165+
while (high >= low) {
166+
auto idx = low + (high - low) / 2;
167+
168+
auto entry = array[idx];
169+
170+
// Shift the range count out of the scalar.
171+
auto lower = (entry << 15) >> 15;
172+
173+
bool isNegative = entry >> 31;
174+
auto rangeCount = (entry << 1) >> 18;
175+
176+
if (isNegative) {
177+
rangeCount = -rangeCount;
178+
}
179+
180+
auto composed = lower + rangeCount;
181+
182+
if (x == lower) {
183+
return composed;
184+
}
185+
186+
if (x > lower) {
187+
low = idx + 1;
188+
continue;
189+
}
190+
191+
if (x < lower) {
192+
high = idx - 1;
193+
continue;
194+
}
195+
}
196+
197+
// If we made it out here, then our scalar was not found in the composition
198+
// array.
199+
// Return the max here to indicate that we couldn't find one.
200+
return std::numeric_limits<__swift_uint32_t>::max();
201+
}
202+
203+
"""
204+
}

0 commit comments

Comments
 (0)