Skip to content

Commit 4080874

Browse files
committed
Make tokenizer merge rules type-safe
1 parent c50c27a commit 4080874

File tree

3 files changed

+31
-17
lines changed

3 files changed

+31
-17
lines changed

Sources/Hub/Hub.swift

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@ public enum TokenizerVocab: @unchecked Sendable {
2727
case unigram(NSArray)
2828
}
2929

30+
/// Type-safe wrapper around the merge rules parsed out of a tokenizer.json
/// file, used for fast BPE tokenizer initialization.
///
/// - Note: Declared `@unchecked Sendable`: the stored array is assigned once
///   at construction and never mutated afterwards, so sharing it across
///   concurrency domains is safe.
public struct TokenizerMerges: @unchecked Sendable {
    /// The merge rules exactly as they appeared in the parsed JSON.
    public let rules: [Any]

    /// Wraps the given raw merge entries.
    /// - Parameter mergeRules: The merge entries decoded from tokenizer.json.
    public init(_ mergeRules: [Any]) {
        rules = mergeRules
    }
}
41+
3042
public extension Hub {
3143
/// Errors that can occur during Hub client operations.
3244
///
@@ -142,7 +154,7 @@ public actor LanguageModelConfigurationFromHub {
142154
private var _tokenizerConfig: Config?
143155
private var _tokenizerData: Config?
144156
private var _tokenizerVocab: TokenizerVocab?
145-
private var _tokenizerMerges: [Any]?
157+
private var _tokenizerMerges: TokenizerMerges?
146158

147159
/// Initializes configuration loading from a remote Hub repository.
148160
///
@@ -257,8 +269,8 @@ public actor LanguageModelConfigurationFromHub {
257269
}
258270
}
259271

260-
/// Raw merges array extracted directly from JSON for fast BPE tokenizer initialization.
261-
public var tokenizerMerges: [Any]? {
272+
/// Merge rules extracted directly from JSON for fast BPE tokenizer initialization.
273+
public var tokenizerMerges: TokenizerMerges? {
262274
get async throws {
263275
try await ensureLoaded()
264276
return _tokenizerMerges
@@ -281,7 +293,7 @@ public actor LanguageModelConfigurationFromHub {
281293
var tokenizerConfig: Config?
282294
var tokenizerData: Config
283295
var tokenizerVocab: TokenizerVocab?
284-
var tokenizerMerges: [Any]?
296+
var tokenizerMerges: TokenizerMerges?
285297
}
286298

287299
/// Resolves tokenizerConfig with fallback logic.
@@ -373,7 +385,7 @@ public actor LanguageModelConfigurationFromHub {
373385

374386
// Extract vocab/merges for fast tokenizer initialization (BPE and Unigram)
375387
var tokenizerVocab: TokenizerVocab? = nil
376-
var tokenizerMerges: [Any]? = nil
388+
var tokenizerMerges: TokenizerMerges? = nil
377389

378390
if let modelDict = parsed["model"] as? NSDictionary {
379391
let model = NSMutableDictionary(dictionary: modelDict)
@@ -382,7 +394,9 @@ public actor LanguageModelConfigurationFromHub {
382394
// Only extract and strip for BPE and Unigram models
383395
if modelType == "BPE", let vocab = model["vocab"] as? NSDictionary {
384396
tokenizerVocab = .bpe(vocab)
385-
tokenizerMerges = model["merges"] as? [Any]
397+
if let merges = model["merges"] as? [Any] {
398+
tokenizerMerges = TokenizerMerges(merges)
399+
}
386400

387401
// Only strip if opted in (for backward compatibility)
388402
if stripVocabForPerformance {

Sources/Tokenizers/Tokenizer.swift

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ enum TokenizerModel {
228228
tokenizerData: Config,
229229
addedTokens: [String: Int],
230230
tokenizerVocab: TokenizerVocab?,
231-
tokenizerMerges: [Any]?,
231+
tokenizerMerges: TokenizerMerges?,
232232
strict: Bool = true
233233
) throws -> TokenizingModel {
234234
guard let tokenizerClassName = tokenizerConfig.tokenizerClass.string() else {
@@ -250,7 +250,7 @@ enum TokenizerModel {
250250
// Note: includes empty subclasses (creates BPETokenizer instance)
251251
if tokenizerClass is BPETokenizer.Type,
252252
case .bpe(let rawVocab) = tokenizerVocab,
253-
let rawMerges = tokenizerMerges
253+
let rawMerges = tokenizerMerges?.rules
254254
{
255255
return try BPETokenizer(
256256
tokenizerConfig: tokenizerConfig,
@@ -285,7 +285,7 @@ enum TokenizerModel {
285285
tokenizerData: Config,
286286
addedTokens: [String: Int],
287287
tokenizerVocab: TokenizerVocab?,
288-
tokenizerMerges: [Any]?,
288+
tokenizerMerges: TokenizerMerges?,
289289
strict: Bool = true
290290
) async throws -> TokenizingModel {
291291
guard let tokenizerClassName = tokenizerConfig.tokenizerClass.string() else {
@@ -307,7 +307,7 @@ enum TokenizerModel {
307307
// Note: includes empty subclasses (creates BPETokenizer instance)
308308
if tokenizerClass is BPETokenizer.Type,
309309
case .bpe(let rawVocab) = tokenizerVocab,
310-
let rawMerges = tokenizerMerges
310+
let rawMerges = tokenizerMerges?.rules
311311
{
312312
return await BPETokenizer.createAsync(
313313
tokenizerConfig: tokenizerConfig,
@@ -628,7 +628,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
628628
tokenizerConfig: Config,
629629
tokenizerData: Config,
630630
tokenizerVocab: TokenizerVocab? = nil,
631-
tokenizerMerges: [Any]? = nil,
631+
tokenizerMerges: TokenizerMerges? = nil,
632632
strict: Bool = true
633633
) throws {
634634
var addedTokens: [String: Int] = [:]
@@ -751,7 +751,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
751751
tokenizerConfig: Config,
752752
tokenizerData: Config,
753753
tokenizerVocab: TokenizerVocab?,
754-
tokenizerMerges: [Any]?,
754+
tokenizerMerges: TokenizerMerges?,
755755
strict: Bool = true
756756
) async throws -> PreTrainedTokenizer {
757757
// Parse addedTokens (small data, used for model init)
@@ -1202,7 +1202,7 @@ public extension AutoTokenizer {
12021202
tokenizerConfig: Config,
12031203
tokenizerData: Config,
12041204
tokenizerVocab: TokenizerVocab?,
1205-
tokenizerMerges: [Any]?,
1205+
tokenizerMerges: TokenizerMerges?,
12061206
strict: Bool = true
12071207
) throws -> Tokenizer {
12081208
let tokenizerClass = tokenizerClass(for: tokenizerConfig)
@@ -1220,7 +1220,7 @@ public extension AutoTokenizer {
12201220
tokenizerConfig: Config,
12211221
tokenizerData: Config,
12221222
tokenizerVocab: TokenizerVocab?,
1223-
tokenizerMerges: [Any]?,
1223+
tokenizerMerges: TokenizerMerges?,
12241224
strict: Bool = true
12251225
) async throws -> Tokenizer {
12261226
let selectedClass = tokenizerClass(for: tokenizerConfig)
@@ -1294,7 +1294,7 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
12941294
tokenizerConfig: Config,
12951295
tokenizerData: Config,
12961296
tokenizerVocab: TokenizerVocab? = nil,
1297-
tokenizerMerges: [Any]? = nil,
1297+
tokenizerMerges: TokenizerMerges? = nil,
12981298
strict: Bool = true
12991299
) throws {
13001300
isLegacy = tokenizerConfig.legacy.boolean(or: true)
@@ -1328,7 +1328,7 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
13281328
tokenizerConfig: Config,
13291329
tokenizerData: Config,
13301330
tokenizerVocab: TokenizerVocab?,
1331-
tokenizerMerges: [Any]?,
1331+
tokenizerMerges: TokenizerMerges?,
13321332
strict: Bool = true
13331333
) async throws -> PreTrainedTokenizer {
13341334
let isLegacy = tokenizerConfig.legacy.boolean(or: true)

Tests/HubTests/ConfigTests.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ struct ConfigTests {
540540
// Merges should be extracted
541541
let merges = try await loader.tokenizerMerges
542542
#expect(merges != nil)
543-
#expect(merges?.count == 2)
543+
#expect(merges?.rules.count == 2)
544544

545545
// tokenizerData.model.vocab should be empty (stripped)
546546
let tokenizerData = try await loader.tokenizerData

0 commit comments

Comments
 (0)