Skip to content

Commit 289d6aa

Browse files
committed
fix: load EOS token from model config.json as fallback
Qwen and some other models store eos_token_id in config.json rather than in tokenizer_config.json. We now read both files, preferring tokenizer_config.json and falling back to the model's config.json for any special-token ID it does not provide. This fixes an issue where generation did not stop at the correct EOS token.
1 parent 6ba46db commit 289d6aa

File tree

1 file changed

+25
-11
lines changed

1 file changed

+25
-11
lines changed

packages/swift/Sources/NodeMLXCore/Tokenizer.swift

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,32 @@ public class HFTokenizer: TokenizerProtocol, @unchecked Sendable {
4242
// Load using swift-transformers AutoTokenizer
4343
self.tokenizer = try await AutoTokenizer.from(modelFolder: modelDirectory)
4444

45-
// Extract special tokens from config
46-
let configURL = modelDirectory.appendingPathComponent("tokenizer_config.json")
47-
if let configData = try? Data(contentsOf: configURL),
48-
let config = try? JSONSerialization.jsonObject(with: configData) as? [String: Any] {
49-
self.bosTokenId = config["bos_token_id"] as? Int
50-
self.eosTokenId = config["eos_token_id"] as? Int
51-
self.padTokenId = config["pad_token_id"] as? Int
52-
} else {
53-
self.bosTokenId = nil
54-
self.eosTokenId = nil
55-
self.padTokenId = nil
45+
// Extract special tokens - try tokenizer_config.json first, then fall back to config.json
46+
var bos: Int? = nil
47+
var eos: Int? = nil
48+
var pad: Int? = nil
49+
50+
// Try tokenizer_config.json
51+
let tokenizerConfigURL = modelDirectory.appendingPathComponent("tokenizer_config.json")
52+
if let data = try? Data(contentsOf: tokenizerConfigURL),
53+
let config = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
54+
bos = config["bos_token_id"] as? Int
55+
eos = config["eos_token_id"] as? Int
56+
pad = config["pad_token_id"] as? Int
5657
}
58+
59+
// Fall back to config.json (model config) for any missing values
60+
let modelConfigURL = modelDirectory.appendingPathComponent("config.json")
61+
if let data = try? Data(contentsOf: modelConfigURL),
62+
let config = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
63+
if bos == nil { bos = config["bos_token_id"] as? Int }
64+
if eos == nil { eos = config["eos_token_id"] as? Int }
65+
if pad == nil { pad = config["pad_token_id"] as? Int }
66+
}
67+
68+
self.bosTokenId = bos
69+
self.eosTokenId = eos
70+
self.padTokenId = pad
5771
}
5872

5973
/// Load tokenizer from HuggingFace Hub model ID

0 commit comments

Comments
 (0)