@@ -128,7 +128,7 @@ def download_file_with_auth(url, token, save_path):
128128 print(f"chkhsh: {chkhsh}")
129129
130130 # print the "pre_tokenizer" content from the tokenizer.json
131- with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
131+ with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
132132 cfg = json.load(f)
133133 pre_tokenizer = cfg["pre_tokenizer"]
134134 print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
@@ -156,15 +156,19 @@ def download_file_with_auth(url, token, save_path):
156156 src_func += "\n"
157157 src_func += " res = None\n"
158158 src_func += "\n"
159- src_func += " # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
160- src_func += " # don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
159+ src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
160+ src_func += " # or pull the latest version of the model from Huggingface\n"
161+ src_func += " # don't edit the hashes manually!\n"
161162 src_func += f"{src_ifs}\n"
162163 src_func += " if res is None:\n"
163164 src_func += " print(\"\\n\")\n"
164165 src_func += " print(\"**************************************************************************************\")\n"
165166 src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
166- src_func += " print(\"** This means that it was not added yet or you are using an older version.\")\n"
167- src_func += " print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
167+ src_func += " print(\"** There are 2 possible reasons for this:\")\n"
168+ src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
169+ src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n"
170+ src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
171+ src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
168172 src_func += " print(\"**\")\n"
169173 src_func += " print(f\"** chkhsh: {chkhsh}\")\n"
170174 src_func += " print(\"**************************************************************************************\")\n"
@@ -249,7 +253,7 @@ def download_file_with_auth(url, token, save_path):
249253 from transformers import AutoTokenizer
250254 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
251255
252- with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
256+ with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
253257 for text in tests:
254258 f.write(f"{text}")
255259 f.write("\n__ggml_vocab_test__\n")
0 commit comments