@@ -204,7 +204,7 @@ class TokenizerArgs:
     tokenizer_path: Optional[Union[Path, str]] = None
     is_sentencepiece: bool = False
     is_tiktoken: bool = False
-    is_tokenizers: bool = False
+    is_hf_tokenizer: bool = False
     t: Optional[Any] = None
 
     def __post_init__(self):
@@ -214,7 +214,7 @@ def __post_init__(self):
             self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
             self.is_tiktoken = True
             self.is_sentencepiece = False
-            self.is_tokenizers = False
+            self.is_hf_tokenizer = False
             return
         except:
             pass
@@ -225,25 +225,25 @@ def __post_init__(self):
             self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = True
-            self.is_tokenizers = False
+            self.is_hf_tokenizer = False
             return
         except:
             pass
 
         try:
-            from tokenizer.tokenizers import TokenizersTokenizer
+            from tokenizer.hf_tokenizer import HFTokenizer
 
-            self.t = TokenizersTokenizer(str(self.tokenizer_path))
+            self.t = HFTokenizer(str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = False
-            self.is_tokenizers = True
+            self.is_hf_tokenizer = True
             return
         except:
             pass
 
         self.is_tiktoken = False
         self.is_sentencepiece = False
-        self.is_tokenizers = False
+        self.is_hf_tokenizer = False
         self.t = None
         return
 
@@ -255,25 +255,25 @@ def validate_model(
         if model is None:
             return
 
-        if len(list(filter(lambda x: x, [self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]))) != 1:
+        if sum([self.is_tiktoken, self.is_hf_tokenizer, self.is_sentencepiece]) != 1:
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")
 
         is_tiktoken = self.is_tiktoken
         is_sentencepiece = self.is_sentencepiece
-        is_tokenizers = self.is_tokenizers
+        is_hf_tokenizer = self.is_hf_tokenizer
         use_tiktoken = model.config.use_tiktoken
-        use_tokenizers = model.config.use_tokenizers
-        use_sentencepiece = not (use_tiktoken or use_tokenizers)
+        use_hf_tokenizer = model.config.use_hf_tokenizer
+        use_sentencepiece = not (use_tiktoken or use_hf_tokenizer)
 
         if (
             (is_tiktoken and not use_tiktoken) or
-            (is_tokenizers and not use_tokenizers) or
+            (is_hf_tokenizer and not use_hf_tokenizer) or
             (is_sentencepiece and not use_sentencepiece)
         ):
             raise RuntimeError(
                 "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
-                    tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
-                    tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
+                    tokenizer_setting_to_name(use_tiktoken, use_hf_tokenizer),
+                    tokenizer_setting_to_name(is_tiktoken, is_hf_tokenizer),
                     model_description,
                 )
             )
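
For context on the simplified check in the last hunk: `sum()` over a list of booleans counts how many are truthy, so `sum([...]) != 1` rejects both "no tokenizer detected" and "more than one detected". Below is a minimal standalone sketch of that exactly-one-flag pattern; `DetectedTokenizer` and `validate` are illustrative names only, not part of the torchchat code:

    from dataclasses import dataclass


    @dataclass
    class DetectedTokenizer:
        # Detection flags mirroring the fields renamed in the diff above.
        is_tiktoken: bool = False
        is_sentencepiece: bool = False
        is_hf_tokenizer: bool = False

        def validate(self) -> None:
            # sum() over bools counts how many flags are set; anything other
            # than exactly one means detection failed or was ambiguous.
            if sum([self.is_tiktoken, self.is_hf_tokenizer, self.is_sentencepiece]) != 1:
                raise RuntimeError("expected exactly one tokenizer to be detected")


    DetectedTokenizer(is_hf_tokenizer=True).validate()  # passes
    # DetectedTokenizer().validate()                    # would raise RuntimeError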