@@ -193,6 +193,7 @@ class TokenizerArgs:
     tokenizer_path: Optional[Union[Path, str]] = None
     is_sentencepiece: bool = False
     is_tiktoken: bool = False
+    is_tokenizers: bool = False
     t: Optional[Any] = None
 
     def __post_init__(self):
@@ -202,6 +203,7 @@ def __post_init__(self):
             self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
             self.is_tiktoken = True
             self.is_sentencepiece = False
+            self.is_tokenizers = False
             return
         except:
             pass
@@ -212,12 +214,25 @@ def __post_init__(self):
             self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = True
+            self.is_tokenizers = False
+            return
+        except:
+            pass
+
+        try:
+            from tokenizer.tokenizers import TokenizersTokenizer
+
+            self.t = TokenizersTokenizer(str(self.tokenizer_path))
+            self.is_tiktoken = False
+            self.is_sentencepiece = False
+            self.is_tokenizers = True
             return
         except:
             pass
 
         self.is_tiktoken = False
         self.is_sentencepiece = False
+        self.is_tokenizers = False
         self.t = None
         return
 
@@ -229,16 +244,27 @@ def validate_model(
         if model is None:
             return
 
-        if self.is_tiktoken == self.is_sentencepiece:
+        if len(list(filter(lambda x: x, [self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]))) != 1:
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")
 
         is_tiktoken = self.is_tiktoken
         is_sentencepiece = self.is_sentencepiece
+        is_tokenizers = self.is_tokenizers
         use_tiktoken = model.config.use_tiktoken
+        use_tokenizers = model.config.use_tokenizers
+        use_sentencepiece = not (use_tiktoken or use_tokenizers)
 
-        if not (is_tiktoken == use_tiktoken) or not (is_sentencepiece != use_tiktoken):
+        if (
+            (is_tiktoken and not use_tiktoken) or
+            (is_tokenizers and not use_tokenizers) or
+            (is_sentencepiece and not use_sentencepiece)
+        ):
             raise RuntimeError(
-                f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)}) does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}) for {model_description}"
+                "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
+                    tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
+                    tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
+                    model_description,
+                )
             )
 
         return
@@ -594,5 +620,9 @@ def _initialize_model(
     return model
 
 
-def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
-    return "TikToken" if tiktoken else "SentencePiece"
+def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str:
+    if tiktoken:
+        return "TikToken"
+    if tokenizers:
+        return "Tokenizers"
+    return "SentencePiece"
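
For reference, here is a minimal, self-contained sketch of the selection and validation logic introduced by this change. The `check_match` helper below is hypothetical, written only to illustrate how `TokenizerArgs.validate_model` behaves after the patch (exactly one detected tokenizer flag may be set, and it must agree with the model config); the real code reads the flags from `model.config` and the detected tokenizer rather than taking them as arguments.

```python
# Hypothetical sketch; not part of the patch.

def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str:
    if tiktoken:
        return "TikToken"
    if tokenizers:
        return "Tokenizers"
    return "SentencePiece"

def check_match(is_tiktoken, is_sentencepiece, is_tokenizers, use_tiktoken, use_tokenizers):
    # Exactly one tokenizer must have been detected from tokenizer_path.
    if [is_tiktoken, is_tokenizers, is_sentencepiece].count(True) != 1:
        raise RuntimeError("no tokenizer was found")
    # SentencePiece is the fallback when the model asks for neither tiktoken nor tokenizers.
    use_sentencepiece = not (use_tiktoken or use_tokenizers)
    if (
        (is_tiktoken and not use_tiktoken)
        or (is_tokenizers and not use_tokenizers)
        or (is_sentencepiece and not use_sentencepiece)
    ):
        raise RuntimeError(
            "model-specified tokenizer ({}) does not match provided tokenizer ({})".format(
                tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
                tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
            )
        )

# A detected HF "tokenizers" tokenizer paired with a model that expects it: passes.
check_match(False, False, True, use_tiktoken=False, use_tokenizers=True)

# The same tokenizer paired with a tiktoken model: raises the mismatch error.
try:
    check_match(False, False, True, use_tiktoken=True, use_tokenizers=False)
except RuntimeError as e:
    print(e)  # model-specified tokenizer (TikToken) does not match provided tokenizer (Tokenizers)
```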