@@ -193,6 +193,7 @@ class TokenizerArgs:
     tokenizer_path: Optional[Union[Path, str]] = None
     is_sentencepiece: bool = False
     is_tiktoken: bool = False
+    is_tokenizers: bool = False
     t: Optional[Any] = None
 
     def __post_init__(self):
@@ -202,6 +203,7 @@ def __post_init__(self):
             self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
             self.is_tiktoken = True
             self.is_sentencepiece = False
+            self.is_tokenizers = False
             return
         except:
             pass
@@ -212,12 +214,25 @@ def __post_init__(self):
             self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = True
+            self.is_tokenizers = False
+            return
+        # NOTE(review): narrowed from bare `except:` so Ctrl-C/SystemExit
+        # are not swallowed by the new fallback step; the pre-existing bare
+        # excepts above/below are unchanged by this patch.
+        except Exception:
+            pass
+
+        try:
+            from tokenizer.tokenizers import TokenizersTokenizer
+
+            self.t = TokenizersTokenizer(str(self.tokenizer_path))
+            self.is_tiktoken = False
+            self.is_sentencepiece = False
+            self.is_tokenizers = True
             return
         except:
             pass
 
         self.is_tiktoken = False
         self.is_sentencepiece = False
+        self.is_tokenizers = False
         self.t = None
         return
@@ -229,16 +244,27 @@ def validate_model(
         if model is None:
             return
 
-        if self.is_tiktoken == self.is_sentencepiece:
+        if sum([self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]) != 1:
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")
 
         is_tiktoken = self.is_tiktoken
         is_sentencepiece = self.is_sentencepiece
+        is_tokenizers = self.is_tokenizers
         use_tiktoken = model.config.use_tiktoken
+        use_tokenizers = model.config.use_tokenizers
+        use_sentencepiece = not (use_tiktoken or use_tokenizers)
 
-        if not (is_tiktoken == use_tiktoken) or not (is_sentencepiece != use_tiktoken):
+        if (
+            (is_tiktoken and not use_tiktoken) or
+            (is_tokenizers and not use_tokenizers) or
+            (is_sentencepiece and not use_sentencepiece)
+        ):
             raise RuntimeError(
-                f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)}) does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}) for {model_description}"
+                "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
+                    tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
+                    tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
+                    model_description,
+                )
             )
 
         return
@@ -594,5 +620,9 @@ def _initialize_model(
     return model
 
 
-def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
-    return "TikToken" if tiktoken else "SentencePiece"
+def tokenizer_setting_to_name(tiktoken: bool = False, tokenizers: bool = False) -> str:
+    if tiktoken:
+        return "TikToken"
+    if tokenizers:
+        return "Tokenizers"
+    return "SentencePiece"
0 commit comments