 from enum import IntEnum, auto
 from transformers import AutoTokenizer

+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")

@@ -62,6 +63,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "mpt",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "refact",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
@@ -158,8 +160,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()

-        print(f"chktok: {{chktok}}")
-        print(f"chkhsh: {{chkhsh}}")
+        logger.debug(f"chktok: {{chktok}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")

         res = None

@@ -168,22 +170,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # don't edit the hashes manually!
{src_ifs}
         if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("** There are 2 possible reasons for this:")
-            print("**       - the model has not been added to convert-hf-to-gguf-update.py yet")
-            print("**       - the pre-tokenization config has changed upstream")
-            print("**     Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            print("** ref:  https://github.com/ggerganov/llama.cpp/pull/6920")
-            print("**")
-            print(f"** chkhsh:  {{chkhsh}}")
-            print("**************************************************************************************")
-            print("\n")
+            logger.warning("\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("** There are 2 possible reasons for this:")
+            logger.warning("**       - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("**       - the pre-tokenization config has changed upstream")
+            logger.warning("**     Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** ref:  https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh:  {{chkhsh}}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

-        print(f"tokenizer.ggml.pre: {{repr(res)}}")
-        print(f"chkhsh: {{chkhsh}}")
+        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")

         return res
 """
@@ -197,6 +199,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 # generate tests for each tokenizer model

 tests = [
+    "ied 4 ½ months",
+    "Führer",
     "",
     " ",
     "  ",
@@ -281,6 +285,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 for model in models:
     name = model["name"]

-    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100

 logger.info("\n")
0 commit comments