@@ -190,7 +190,7 @@ class TokenizerArgs:
190190 tokenizer_path : Optional [Union [Path , str ]] = None
191191 is_sentencepiece : bool = False
192192 is_tiktoken : bool = False
193- is_tokenizers : bool = False
193+ is_hf_tokenizer : bool = False
194194 t : Optional [Any ] = None
195195
196196 def __post_init__ (self ):
@@ -200,7 +200,7 @@ def __post_init__(self):
200200 self .t = TiktokenTokenizer (model_path = str (self .tokenizer_path ))
201201 self .is_tiktoken = True
202202 self .is_sentencepiece = False
203- self .is_tokenizers = False
203+ self .is_hf_tokenizer = False
204204 return
205205 except :
206206 pass
@@ -211,25 +211,25 @@ def __post_init__(self):
211211 self .t = SentencePieceProcessor (model_file = str (self .tokenizer_path ))
212212 self .is_tiktoken = False
213213 self .is_sentencepiece = True
214- self .is_tokenizers = False
214+ self .is_hf_tokenizer = False
215215 return
216216 except :
217217 pass
218218
219219 try :
220- from tokenizer .tokenizers import TokenizersTokenizer
220+ from tokenizer .hf_tokenizer import HFTokenizer
221221
222- self .t = TokenizersTokenizer (str (self .tokenizer_path ))
222+ self .t = HFTokenizer (str (self .tokenizer_path ))
223223 self .is_tiktoken = False
224224 self .is_sentencepiece = False
225- self .is_tokenizers = True
225+ self .is_hf_tokenizer = True
226226 return
227227 except :
228228 pass
229229
230230 self .is_tiktoken = False
231231 self .is_sentencepiece = False
232- self .is_tokenizers = False
232+ self .is_hf_tokenizer = False
233233 self .t = None
234234 return
235235
@@ -241,25 +241,25 @@ def validate_model(
241241 if model is None :
242242 return
243243
244- if len ( list ( filter ( lambda x : x , [self .is_tiktoken , self .is_tokenizers , self .is_sentencepiece ])) ) != 1 :
244+ if sum ( [self .is_tiktoken , self .is_hf_tokenizer , self .is_sentencepiece ]) != 1 :
245245 raise RuntimeError (f"no tokenizer was found at { self .tokenizer_path } " )
246246
247247 is_tiktoken = self .is_tiktoken
248248 is_sentencepiece = self .is_sentencepiece
249- is_tokenizers = self .is_tokenizers
249+ is_hf_tokenizer = self .is_hf_tokenizer
250250 use_tiktoken = model .config .use_tiktoken
251- use_tokenizers = model .config .use_tokenizers
252- use_sentencepiece = not (use_tiktoken or use_tokenizers )
251+ use_hf_tokenizer = model .config .use_hf_tokenizer
252+ use_sentencepiece = not (use_tiktoken or use_hf_tokenizer )
253253
254254 if (
255255 (is_tiktoken and not use_tiktoken ) or
256- (is_tokenizers and not use_tokenizers ) or
256+ (is_hf_tokenizer and not use_hf_tokenizer ) or
257257 (is_sentencepiece and not use_sentencepiece )
258258 ):
259259 raise RuntimeError (
260260 "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}" .format (
261- tokenizer_setting_to_name (use_tiktoken , use_tokenizers ),
262- tokenizer_setting_to_name (is_tiktoken , is_tokenizers ),
261+ tokenizer_setting_to_name (use_tiktoken , use_hf_tokenizer ),
262+ tokenizer_setting_to_name (is_tiktoken , is_hf_tokenizer ),
263263 model_description ,
264264 )
265265 )