Skip to content

Commit f7dc810

Browse files
committed
Replace force unwraps in UnigramTokenizer
1 parent 4153602 commit f7dc810

File tree

1 file changed

+20
-8
lines changed

1 file changed

+20
-8
lines changed

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,16 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
         unknownPiece = SentencePieceToken(token: vocab[unknownTokenId].token, score: minScore - 10)
 
         tokensToIds = Dictionary(uniqueKeysWithValues: vocab.map { $0.token as NSString }.enumerated().map { ($1, $0) })
-        bosTokenId = tokensToIds[bosToken! as NSString] // May be nil
-
-        eosToken = tokenizerConfig.eosToken.string()
-        eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! as NSString]
+        // bosToken is hardcoded as " " for Unigram tokenizers
+        bosTokenId = tokensToIds[" " as NSString]
+
+        let eos = tokenizerConfig.eosToken.string()
+        eosToken = eos
+        if let eos {
+            eosTokenId = tokensToIds[eos as NSString]
+        } else {
+            eosTokenId = nil
+        }
 
         trie = Trie()
         trie.append(contentsOf: vocab.map { $0.token })
@@ -150,10 +156,16 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
         unknownPiece = SentencePieceToken(token: vocab[unknownTokenId].token, score: minScore - 10)
 
         tokensToIds = Dictionary(uniqueKeysWithValues: vocab.map { $0.token as NSString }.enumerated().map { ($1, $0) })
-        bosTokenId = tokensToIds[bosToken! as NSString]
-
-        eosToken = tokenizerConfig.eosToken.string()
-        eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! as NSString]
+        // bosToken is hardcoded as " " for Unigram tokenizers
+        bosTokenId = tokensToIds[" " as NSString]
+
+        let eos = tokenizerConfig.eosToken.string()
+        eosToken = eos
+        if let eos {
+            eosTokenId = tokensToIds[eos as NSString]
+        } else {
+            eosTokenId = nil
+        }
 
         trie = Trie()
         trie.append(contentsOf: vocab.map { $0.token })

0 commit comments

Comments
 (0)