Skip to content

Commit 4e611a9

Browse files
committed
Improvements for yyjson
1 parent d8f12f3 commit 4e611a9

File tree

4 files changed

+250
-81
lines changed

4 files changed

+250
-81
lines changed

Sources/Hub/HubApi.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,14 @@ public extension HubApi {
325325
/// `fileURL` is a complete local file path for the given model
326326
func configuration(fileURL: URL) throws -> Config {
327327
let data = try Data(contentsOf: fileURL)
328-
return try YYJSONParser.bomPreservingParseToConfig(data)
328+
do {
329+
return try YYJSONParser.bomPreservingParseToConfig(data)
330+
} catch {
331+
throw Hub.HubClientError.jsonSerialization(
332+
fileURL: fileURL,
333+
message: "JSON parsing failed for \(fileURL): \(error.localizedDescription). If this is a private model, verify that HF_TOKEN is set."
334+
)
335+
}
329336
}
330337
}
331338

Sources/Hub/YYJSONParser.swift

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ enum YYJSONParser {
4646
let doc = yyjson_read_opts(
4747
UnsafeMutableRawPointer(mutating: baseAddress).assumingMemoryBound(to: CChar.self),
4848
buffer.count,
49-
YYJSON_READ_ALLOW_BOM,
49+
0,
5050
nil,
5151
&err
5252
)
@@ -66,12 +66,15 @@ enum YYJSONParser {
6666
}
6767
}
6868

69-
/// Parses JSON data with BOM handling directly into a Config object.
69+
/// Parses JSON data into a Config object, preserving BOM characters in strings.
70+
///
71+
/// Unlike Foundation's `JSONSerialization`, yyjson correctly preserves BOM
72+
/// characters (`\u{feff}`) within string values. This matters for tokenizers
73+
/// like Gemma that use BOM as a token prefix (e.g., `"\u{feff}#"`).
7074
///
71-
/// Note: yyjson with YYJSON_READ_ALLOW_BOM handles BOM correctly without
72-
/// the duplication workaround needed for JSONSerialization.
75+
/// See: https://github.com/huggingface/swift-transformers/issues/116
7376
static func bomPreservingParseToConfig(_ data: Data) throws -> Config {
74-
return try parseToConfig(data)
77+
try parseToConfig(data)
7578
}
7679

7780
/// Parses JSON data into an NSDictionary.
@@ -187,31 +190,28 @@ enum YYJSONParser {
187190
// MARK: - Direct Config conversion
188191

189192
private static func convertToConfig(_ val: UnsafeMutablePointer<yyjson_val>) -> Config {
190-
let tag = yyjson_get_tag(val)
191-
let type = tag & 0x07
192-
let subtype = tag & 0x18
193-
194-
switch type {
195-
case 0x02: // YYJSON_TYPE_NULL
193+
if yyjson_is_null(val) {
196194
return Config()
197-
case 0x03: // YYJSON_TYPE_BOOL
198-
return Config(subtype == 0x08)
199-
case 0x04: // YYJSON_TYPE_NUM
200-
if subtype == 0x00 { // YYJSON_SUBTYPE_UINT
201-
return Config(Int(yyjson_get_uint(val)))
202-
} else if subtype == 0x08 { // YYJSON_SUBTYPE_SINT
203-
return Config(Int(yyjson_get_sint(val)))
204-
} else { // YYJSON_SUBTYPE_REAL
205-
return Config(Float(yyjson_get_real(val)))
195+
} else if yyjson_is_bool(val) {
196+
return Config(yyjson_get_bool(val))
197+
} else if yyjson_is_uint(val) {
198+
let uintVal = yyjson_get_uint(val)
199+
if uintVal > UInt64(Int.max) {
200+
return Config(Float(uintVal))
206201
}
207-
case 0x05: // YYJSON_TYPE_STR
202+
return Config(Int(uintVal))
203+
} else if yyjson_is_sint(val) {
204+
return Config(Int(yyjson_get_sint(val)))
205+
} else if yyjson_is_real(val) {
206+
return Config(Float(yyjson_get_real(val)))
207+
} else if yyjson_is_str(val) {
208208
guard let str = yyjson_get_str(val) else { return Config("") }
209209
return Config(String(cString: str))
210-
case 0x06: // YYJSON_TYPE_ARR
210+
} else if yyjson_is_arr(val) {
211211
return convertArrayToConfig(val)
212-
case 0x07: // YYJSON_TYPE_OBJ
212+
} else if yyjson_is_obj(val) {
213213
return convertObjectToConfig(val)
214-
default:
214+
} else {
215215
return Config()
216216
}
217217
}

Tests/Benchmarks/JSONParserBenchmarkTests.swift

Lines changed: 137 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
// Benchmark tests comparing JSONSerialization vs yyjson performance.
66
//
77

8+
import Dispatch
89
import Foundation
910
import Testing
1011
import Tokenizers
@@ -35,58 +36,146 @@ struct JSONParserBenchmarkTests {
3536
offlineHubApi = HubApi(useOfflineMode: true)
3637
}
3738

39+
// MARK: - Benchmark Utilities
40+
41+
/// Summary statistics for one series of benchmark timings.
///
/// Values are stored exactly as supplied; `formatted` labels them as milliseconds.
struct BenchmarkStats {
    let mean: Double
    let stdDev: Double
    let min: Double
    let max: Double

    /// The mean and standard deviation as `"<mean> ms (± <stdDev>)"`,
    /// each rendered with one decimal place.
    var formatted: String {
        let template = "%.1f ms (± %.1f)"
        return String(format: template, mean, stdDev)
    }
}
51+
52+
/// Times `block` over repeated runs and reports progress on standard output.
///
/// Prints `label` padded to `labelWidth`, then one progress mark per measured run
/// (the run number on every tenth run, a dot otherwise), and finally the mean duration.
/// Durations are taken from `DispatchTime` uptime readings around each run.
///
/// - Parameters:
///   - label: Text printed at the start of the progress line.
///   - labelWidth: Width the label is padded to.
///   - iterations: Number of measured runs.
///   - warmup: Number of unmeasured runs executed first.
///   - block: The work to time.
/// - Returns: The duration of each measured run, in milliseconds.
/// - Throws: Rethrows any error thrown by `block`.
private func measure(
    label: String,
    labelWidth: Int,
    iterations: Int,
    warmup: Int = 2,
    _ block: () throws -> Void
) rethrows -> [Double] {
    print(label.padding(toLength: labelWidth, withPad: " ", startingAt: 0) + " ", terminator: "")
    fflush(stdout)

    // Unmeasured runs before timing starts.
    for _ in 0..<warmup {
        try block()
    }

    var samples: [Double] = []
    samples.reserveCapacity(iterations)

    for index in 0..<iterations {
        let began = DispatchTime.now()
        try block()
        let finished = DispatchTime.now()
        let elapsedNanoseconds = finished.uptimeNanoseconds - began.uptimeNanoseconds
        samples.append(Double(elapsedNanoseconds) / 1_000_000)

        // Flush after every mark so progress is visible while the benchmark runs.
        let run = index + 1
        let mark = run % 10 == 0 ? String(format: "%2d", run) : "."
        print(mark, terminator: "")
        fflush(stdout)
    }

    let average = samples.reduce(0, +) / Double(samples.count)
    print(String(format: " %6.1f ms", average))

    return samples
}
92+
93+
/// Times an asynchronous `block` over repeated runs and reports progress on standard output.
///
/// Prints `label` padded to `labelWidth`, then one progress mark per measured run
/// (the run number on every fifth run, a dot otherwise), and finally the mean duration.
/// Durations are taken from `DispatchTime` uptime readings around each awaited run.
///
/// - Parameters:
///   - label: Text printed at the start of the progress line.
///   - labelWidth: Width the label is padded to.
///   - iterations: Number of measured runs.
///   - warmup: Number of unmeasured runs executed first.
///   - block: The asynchronous work to time.
/// - Returns: The duration of each measured run, in milliseconds.
/// - Throws: Rethrows any error thrown by `block`.
private func measureAsync(
    label: String,
    labelWidth: Int,
    iterations: Int,
    warmup: Int = 2,
    _ block: () async throws -> Void
) async rethrows -> [Double] {
    print(label.padding(toLength: labelWidth, withPad: " ", startingAt: 0) + " ", terminator: "")
    fflush(stdout)

    // Unmeasured runs before timing starts.
    for _ in 0..<warmup {
        try await block()
    }

    var samples: [Double] = []
    samples.reserveCapacity(iterations)

    for index in 0..<iterations {
        let began = DispatchTime.now()
        try await block()
        let finished = DispatchTime.now()
        let elapsedNanoseconds = finished.uptimeNanoseconds - began.uptimeNanoseconds
        samples.append(Double(elapsedNanoseconds) / 1_000_000)

        // Flush after every mark so progress is visible while the benchmark runs.
        let run = index + 1
        let mark = run % 5 == 0 ? String(format: "%2d", run) : "."
        print(mark, terminator: "")
        fflush(stdout)
    }

    let average = samples.reduce(0, +) / Double(samples.count)
    print(String(format: " %6.1f ms", average))

    return samples
}
133+
134+
/// Summarizes a series of timings as mean, standard deviation, minimum, and maximum.
///
/// - Parameter times: The samples to summarize.
/// - Returns: The statistics for `times`. An empty series yields all zeros, matching
///   the zero fallback already used for the minimum and maximum, instead of the NaN
///   that dividing by a zero count would produce.
private func stats(_ times: [Double]) -> BenchmarkStats {
    // min()/max() are nil only for an empty series; return before dividing by zero.
    guard let fastest = times.min(), let slowest = times.max() else {
        return BenchmarkStats(mean: 0, stdDev: 0, min: 0, max: 0)
    }

    let count = Double(times.count)
    let mean = times.reduce(0, +) / count
    // Population variance: the squared deviations are divided by N, not N - 1.
    let variance = times.map { ($0 - mean) * ($0 - mean) }.reduce(0, +) / count
    let stdDev = sqrt(variance)
    return BenchmarkStats(mean: mean, stdDev: stdDev, min: fastest, max: slowest)
}
142+
38143
@Test
39144
func compareParsingSpeed() throws {
40-
let iterations = 10
145+
let iterations = 50
146+
let labelWidth = 25
41147

42-
print("Warming up...")
43-
let _ = try YYJSONParser.parseToConfig(benchmarkData)
44-
let _ = try JSONSerialization.jsonObject(with: benchmarkData, options: [])
148+
print("Benchmarking with \(iterations) iterations...\n")
45149

46-
print("Benchmarking yyjson raw parsing...")
47-
let yyjsonRawStart = CFAbsoluteTimeGetCurrent()
48-
for _ in 0..<iterations {
150+
let yyjsonRawTimes = measure(label: "yyjson (raw)", labelWidth: labelWidth, iterations: iterations) {
49151
benchmarkData.withUnsafeBytes { buffer in
50152
let doc = yyjson_read(buffer.baseAddress?.assumingMemoryBound(to: CChar.self), buffer.count, 0)
51153
yyjson_doc_free(doc)
52154
}
53155
}
54-
let yyjsonRawTime = CFAbsoluteTimeGetCurrent() - yyjsonRawStart
55156

56-
print("Benchmarking yyjson → Config...")
57-
let yyjsonDirectStart = CFAbsoluteTimeGetCurrent()
58-
for _ in 0..<iterations {
157+
let yyjsonConfigTimes = try measure(label: "yyjson -> Config", labelWidth: labelWidth, iterations: iterations) {
59158
let _ = try YYJSONParser.parseToConfig(benchmarkData)
60159
}
61-
let yyjsonDirectTime = CFAbsoluteTimeGetCurrent() - yyjsonDirectStart
62160

63-
print("Benchmarking JSONSerialization raw parsing...")
64-
let jsonSerRawStart = CFAbsoluteTimeGetCurrent()
65-
for _ in 0..<iterations {
161+
let jsonSerRawTimes = try measure(label: "JSONSerialization (raw)", labelWidth: labelWidth, iterations: iterations) {
66162
let _ = try JSONSerialization.jsonObject(with: benchmarkData, options: [])
67163
}
68-
let jsonSerRawTime = CFAbsoluteTimeGetCurrent() - jsonSerRawStart
69164

70-
print("Benchmarking JSONSerialization → Config...")
71-
let jsonSerFullStart = CFAbsoluteTimeGetCurrent()
72-
for _ in 0..<iterations {
165+
let jsonSerConfigTimes = try measure(label: "JSONSerialization+Config", labelWidth: labelWidth, iterations: iterations) {
73166
let parsed = try JSONSerialization.jsonObject(with: benchmarkData, options: [])
74167
let _ = Config(parsed as! [NSString: Any])
75168
}
76-
let jsonSerFullTime = CFAbsoluteTimeGetCurrent() - jsonSerFullStart
77-
78-
let rawSpeedup = jsonSerRawTime / yyjsonRawTime
79-
let directSpeedup = jsonSerFullTime / yyjsonDirectTime
80169

81-
// Calculate average time per operation (in milliseconds)
82-
let yyjsonRawAvg = (yyjsonRawTime / Double(iterations)) * 1000
83-
let yyjsonDirectAvg = (yyjsonDirectTime / Double(iterations)) * 1000
84-
let jsonSerRawAvg = (jsonSerRawTime / Double(iterations)) * 1000
85-
let jsonSerFullAvg = (jsonSerFullTime / Double(iterations)) * 1000
170+
let yyjsonRawStats = stats(yyjsonRawTimes)
171+
let yyjsonConfigStats = stats(yyjsonConfigTimes)
172+
let jsonSerRawStats = stats(jsonSerRawTimes)
173+
let jsonSerConfigStats = stats(jsonSerConfigTimes)
86174

87-
// Time saved per operation
88-
let rawTimeSaved = jsonSerRawAvg - yyjsonRawAvg
89-
let directTimeSaved = jsonSerFullAvg - yyjsonDirectAvg
175+
let rawSpeedup = jsonSerRawStats.mean / yyjsonRawStats.mean
176+
let configSpeedup = jsonSerConfigStats.mean / yyjsonConfigStats.mean
177+
let rawTimeSaved = jsonSerRawStats.mean - yyjsonRawStats.mean
178+
let configTimeSaved = jsonSerConfigStats.mean - yyjsonConfigStats.mean
90179

91180
print(
92181
"""
@@ -95,13 +184,13 @@ struct JSONParserBenchmarkTests {
95184
JSON Parsing Benchmark Results (\(iterations) iterations)
96185
File size: \(ByteCountFormatter.string(fromByteCount: Int64(benchmarkData.count), countStyle: .file))
97186
============================================
98-
yyjson (raw parse): \(String(format: "%.3f", yyjsonRawTime))s (\(String(format: "%.1f", yyjsonRawAvg)) ms avg)
99-
yyjson Config: \(String(format: "%.3f", yyjsonDirectTime))s (\(String(format: "%.1f", yyjsonDirectAvg)) ms avg)
100-
JSONSerialization (raw): \(String(format: "%.3f", jsonSerRawTime))s (\(String(format: "%.1f", jsonSerRawAvg)) ms avg)
101-
JSONSerialization+Config: \(String(format: "%.3f", jsonSerFullTime))s (\(String(format: "%.1f", jsonSerFullAvg)) ms avg)
187+
yyjson (raw parse): \(yyjsonRawStats.formatted)
188+
yyjson -> Config: \(yyjsonConfigStats.formatted)
189+
JSONSerialization (raw): \(jsonSerRawStats.formatted)
190+
JSONSerialization+Config: \(jsonSerConfigStats.formatted)
102191
--------------------------------------------
103192
Raw parse speedup: \(String(format: "%.2f", rawSpeedup))x (\(String(format: "%.0f", rawTimeSaved)) ms saved)
104-
Full path speedup: \(String(format: "%.2f", directSpeedup))x (\(String(format: "%.0f", directTimeSaved)) ms saved)
193+
Full path speedup: \(String(format: "%.2f", configSpeedup))x (\(String(format: "%.0f", configTimeSaved)) ms saved)
105194
============================================
106195
107196
""")
@@ -130,31 +219,23 @@ struct JSONParserBenchmarkTests {
130219

131220
@Test
132221
func compareTokenizerLoadingSpeed() async throws {
133-
let iterations = 5
222+
let iterations = 20
223+
let labelWidth = 18
134224

135-
print("Warming up...")
136-
let _ = try await AutoTokenizer.from(modelFolder: modelFolder, hubApi: offlineHubApi)
225+
print("Benchmarking tokenizer loading with \(iterations) iterations...\n")
137226

138-
print("Benchmarking tokenizer loading with yyjson...")
139-
let yyjsonStart = CFAbsoluteTimeGetCurrent()
140-
for _ in 0..<iterations {
227+
let yyjsonTimes = try await measureAsync(label: "yyjson (current)", labelWidth: labelWidth, iterations: iterations) {
141228
let _ = try await AutoTokenizer.from(modelFolder: modelFolder, hubApi: offlineHubApi)
142229
}
143-
let yyjsonTime = CFAbsoluteTimeGetCurrent() - yyjsonStart
144230

145-
print("Benchmarking tokenizer loading with JSONSerialization...")
146-
let jsonSerStart = CFAbsoluteTimeGetCurrent()
147-
for _ in 0..<iterations {
231+
let jsonSerTimes = try await measureAsync(label: "JSONSerialization", labelWidth: labelWidth, iterations: iterations) {
148232
let _ = try await loadTokenizerWithJSONSerialization()
149233
}
150-
let jsonSerTime = CFAbsoluteTimeGetCurrent() - jsonSerStart
151-
152-
let speedup = jsonSerTime / yyjsonTime
153234

154-
// Calculate average time per load (in milliseconds)
155-
let yyjsonAvg = (yyjsonTime / Double(iterations)) * 1000
156-
let jsonSerAvg = (jsonSerTime / Double(iterations)) * 1000
157-
let timeSaved = jsonSerAvg - yyjsonAvg
235+
let yyjsonStats = stats(yyjsonTimes)
236+
let jsonSerStats = stats(jsonSerTimes)
237+
let speedup = jsonSerStats.mean / yyjsonStats.mean
238+
let timeSaved = jsonSerStats.mean - yyjsonStats.mean
158239

159240
print(
160241
"""
@@ -163,10 +244,10 @@ struct JSONParserBenchmarkTests {
163244
Tokenizer Loading Benchmark (\(iterations) iterations)
164245
Model: \(Self.modelId)
165246
============================================
166-
yyjson (current): \(String(format: "%.3f", yyjsonTime))s (\(String(format: "%.0f", yyjsonAvg)) ms avg)
167-
JSONSerialization: \(String(format: "%.3f", jsonSerTime))s (\(String(format: "%.0f", jsonSerAvg)) ms avg)
247+
yyjson (current): \(yyjsonStats.formatted)
248+
JSONSerialization: \(jsonSerStats.formatted)
168249
--------------------------------------------
169-
Speedup: \(String(format: "%.2f", speedup))x faster with yyjson (\(String(format: "%.0f", timeSaved)) ms saved)
250+
Speedup: \(String(format: "%.2f", speedup))x faster (\(String(format: "%.0f", timeSaved)) ms saved)
170251
============================================
171252
172253
""")

0 commit comments

Comments
 (0)