@@ -204,6 +204,7 @@ class TokenizerArgs:
204204 tokenizer_path : Optional [Union [Path , str ]] = None
205205 is_sentencepiece : bool = False
206206 is_tiktoken : bool = False
207+ is_tokenizers : bool = False
207208 t : Optional [Any ] = None
208209
209210 def __post_init__ (self ):
@@ -213,6 +214,7 @@ def __post_init__(self):
213214 self .t = TiktokenTokenizer (model_path = str (self .tokenizer_path ))
214215 self .is_tiktoken = True
215216 self .is_sentencepiece = False
217+ self .is_tokenizers = False
216218 return
217219 except :
218220 pass
@@ -223,12 +225,25 @@ def __post_init__(self):
223225 self .t = SentencePieceProcessor (model_file = str (self .tokenizer_path ))
224226 self .is_tiktoken = False
225227 self .is_sentencepiece = True
228+ self .is_tokenizers = False
229+ return
230+ except :
231+ pass
232+
233+ try :
234+ from tokenizer .tokenizers import TokenizersTokenizer
235+
236+ self .t = TokenizersTokenizer (str (self .tokenizer_path ))
237+ self .is_tiktoken = False
238+ self .is_sentencepiece = False
239+ self .is_tokenizers = True
226240 return
227241 except :
228242 pass
229243
230244 self .is_tiktoken = False
231245 self .is_sentencepiece = False
246+ self .is_tokenizers = False
232247 self .t = None
233248 return
234249
@@ -240,16 +255,27 @@ def validate_model(
240255 if model is None :
241256 return
242257
243- if self .is_tiktoken == self .is_sentencepiece :
258+ if len ( list ( filter ( lambda x : x , [ self .is_tiktoken , self . is_tokenizers , self .is_sentencepiece ]))) != 1 :
244259 raise RuntimeError (f"no tokenizer was found at { self .tokenizer_path } " )
245260
246261 is_tiktoken = self .is_tiktoken
247262 is_sentencepiece = self .is_sentencepiece
263+ is_tokenizers = self .is_tokenizers
248264 use_tiktoken = model .config .use_tiktoken
265+ use_tokenizers = model .config .use_tokenizers
266+ use_sentencepiece = not (use_tiktoken or use_tokenizers )
249267
250- if not (is_tiktoken == use_tiktoken ) or not (is_sentencepiece != use_tiktoken ):
268+ if (
269+ (is_tiktoken and not use_tiktoken ) or
270+ (is_tokenizers and not use_tokenizers ) or
271+ (is_sentencepiece and not use_sentencepiece )
272+ ):
251273 raise RuntimeError (
252- f"model-specified tokenizer ({ tokenizer_setting_to_name (use_tiktoken )} ) does not match provided tokenizer ({ tokenizer_setting_to_name (is_tiktoken )} ) for { model_description } "
274+ "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}" .format (
275+ tokenizer_setting_to_name (use_tiktoken , use_tokenizers ),
276+ tokenizer_setting_to_name (is_tiktoken , is_tokenizers ),
277+ model_description ,
278+ )
253279 )
254280
255281 return
@@ -605,5 +631,9 @@ def _initialize_model(
605631 return model
606632
607633
def tokenizer_setting_to_name(tiktoken: bool = False, tokenizers: bool = False) -> str:
    """Map tokenizer-selection flags to a human-readable tokenizer name.

    Defaults restore backward compatibility with the earlier one-argument
    signature (``tokenizer_setting_to_name(tiktoken=False)``) while still
    supporting the newer two-flag callers.

    Args:
        tiktoken: True when the TikToken tokenizer is in use.
        tokenizers: True when the HF ``tokenizers``-based tokenizer is in use.

    Returns:
        "TikToken", "Tokenizers", or "SentencePiece" (the fallback when
        neither flag is set). ``tiktoken`` takes precedence if both are True.
    """
    if tiktoken:
        return "TikToken"
    if tokenizers:
        return "Tokenizers"
    return "SentencePiece"
0 commit comments