@@ -102,10 +102,16 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
102102 unknownPiece = SentencePieceToken ( token: vocab [ unknownTokenId] . token, score: minScore - 10 )
103103
104104 tokensToIds = Dictionary ( uniqueKeysWithValues: vocab. map { $0. token as NSString } . enumerated ( ) . map { ( $1, $0) } )
105- bosTokenId = tokensToIds [ bosToken! as NSString ] // May be nil
106-
107- eosToken = tokenizerConfig. eosToken. string ( )
108- eosTokenId = eosToken == nil ? nil : tokensToIds [ eosToken! as NSString ]
105+ // bosToken is hardcoded as " " for Unigram tokenizers
106+ bosTokenId = tokensToIds [ " " as NSString ]
107+
108+ let eos = tokenizerConfig. eosToken. string ( )
109+ eosToken = eos
110+ if let eos {
111+ eosTokenId = tokensToIds [ eos as NSString ]
112+ } else {
113+ eosTokenId = nil
114+ }
109115
110116 trie = Trie ( )
111117 trie. append ( contentsOf: vocab. map { $0. token } )
@@ -150,10 +156,16 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
150156 unknownPiece = SentencePieceToken ( token: vocab [ unknownTokenId] . token, score: minScore - 10 )
151157
152158 tokensToIds = Dictionary ( uniqueKeysWithValues: vocab. map { $0. token as NSString } . enumerated ( ) . map { ( $1, $0) } )
153- bosTokenId = tokensToIds [ bosToken! as NSString ]
154-
155- eosToken = tokenizerConfig. eosToken. string ( )
156- eosTokenId = eosToken == nil ? nil : tokensToIds [ eosToken! as NSString ]
159+ // bosToken is hardcoded as " " for Unigram tokenizers
160+ bosTokenId = tokensToIds [ " " as NSString ]
161+
162+ let eos = tokenizerConfig. eosToken. string ( )
163+ eosToken = eos
164+ if let eos {
165+ eosTokenId = tokensToIds [ eos as NSString ]
166+ } else {
167+ eosTokenId = nil
168+ }
157169
158170 trie = Trie ( )
159171 trie. append ( contentsOf: vocab. map { $0. token } )
0 commit comments