|
20 | 20 | # - Update llama.cpp with the new pre-tokenizer if necessary |
21 | 21 | # |
22 | 22 | # TODO: generate tokenizer tests for llama.cpp |
23 | | -# TODO: automate the update of convert-hf-to-gguf.py |
24 | 23 | # |
25 | 24 |
|
26 | 25 | import logging |
27 | 26 | import os |
| 27 | +import pathlib |
| 28 | +import re |
| 29 | + |
28 | 30 | import requests |
29 | 31 | import sys |
30 | 32 | import json |
|
35 | 37 |
|
36 | 38 | logging.basicConfig(level=logging.DEBUG) |
37 | 39 | logger = logging.getLogger("convert-hf-to-gguf-update") |
| 40 | +sess = requests.Session() |
38 | 41 |
|
39 | 42 |
|
40 | 43 | class TOKENIZER_TYPE(IntEnum): |
@@ -79,63 +82,44 @@ class TOKENIZER_TYPE(IntEnum): |
79 | 82 | {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, |
80 | 83 | ] |
81 | 84 |
|
82 | | -# make directory "models/tokenizers" if it doesn't exist |
83 | | -if not os.path.exists("models/tokenizers"): |
84 | | - os.makedirs("models/tokenizers") |
85 | | - |
86 | 85 |
|
87 | 86 | def download_file_with_auth(url, token, save_path): |
88 | 87 | headers = {"Authorization": f"Bearer {token}"} |
89 | | - response = requests.get(url, headers=headers) |
90 | | - if response.status_code == 200: |
91 | | - with open(save_path, 'wb') as f: |
92 | | - f.write(response.content) |
93 | | - logger.info(f"File {save_path} downloaded successfully") |
94 | | - else: |
95 | | - logger.info(f"Failed to download file. Status code: {response.status_code}") |
| 88 | + response = sess.get(url, headers=headers) |
| 89 | + response.raise_for_status() |
| 90 | + os.makedirs(os.path.dirname(save_path), exist_ok=True) |
| 91 | + with open(save_path, 'wb') as f: |
| 92 | + f.write(response.content) |
| 93 | + logger.info(f"File {save_path} downloaded successfully") |
96 | 94 |
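The rewritten helper routes every request through the shared `requests.Session` created above, so the many small fetches from huggingface.co can reuse one keep-alive connection, and `raise_for_status()` surfaces HTTP errors as exceptions instead of the old log-and-continue. One caveat: `response.content` buffers the entire payload in memory, which is fine for tokenizer files but would hurt for large LFS blobs. A streamed variant could look like the sketch below (assumes the same `sess`, `os`, and `logger`; not part of this patch):

```python
# Hedged sketch: stream the response to disk in chunks instead of
# buffering it all in memory. Uses only standard `requests` APIs.
def download_file_with_auth_streaming(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    with sess.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 16):
                f.write(chunk)  # 64 KiB per chunk
    logger.info(f"File {save_path} downloaded successfully")
```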
|
97 | 95 |
|
98 | | -# download the tokenizer models |
99 | | -for model in models: |
| 96 | +def download_model(model): |
100 | 97 | name = model["name"] |
101 | 98 | repo = model["repo"] |
102 | 99 | tokt = model["tokt"] |
103 | 100 |
|
104 | | - if not os.path.exists(f"models/tokenizers/{name}"): |
105 | | - os.makedirs(f"models/tokenizers/{name}") |
106 | | - else: |
107 | | - logger.info(f"Directory models/tokenizers/{name} already exists - skipping") |
108 | | - continue |
109 | | - |
110 | | - logger.info(f"Downloading {name} to models/tokenizers/{name}") |
| 101 | + os.makedirs(f"models/tokenizers/{name}", exist_ok=True) |
111 | 102 |
|
112 | | - url = f"{repo}/raw/main/config.json" |
113 | | - save_path = f"models/tokenizers/{name}/config.json" |
114 | | - download_file_with_auth(url, token, save_path) |
| 103 | + files = ["config.json", "tokenizer.json", "tokenizer_config.json"] |
| 104 | + if tokt == TOKENIZER_TYPE.SPM: |
| 105 | + files.append("tokenizer.model") |
115 | 106 |
|
116 | | - url = f"{repo}/raw/main/tokenizer.json" |
117 | | - save_path = f"models/tokenizers/{name}/tokenizer.json" |
118 | | - download_file_with_auth(url, token, save_path) |
| 107 | + for file in files: |
| 108 | + save_path = f"models/tokenizers/{name}/{file}" |
| 109 | + if os.path.isfile(save_path): |
| 110 | + logger.info(f"{name}: File {save_path} already exists - skipping") |
| 111 | + continue |
| 112 | + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) |
119 | 113 |
|
120 | | - # if downloaded file is less than 1KB, we likely need to download an LFS instead |
121 | | - if os.path.getsize(save_path) < 1024: |
122 | | - # remove the file |
123 | | - os.remove(save_path) |
124 | | - url = f"{repo}/resolve/main/tokenizer.json" |
125 | | - save_path = f"models/tokenizers/{name}/tokenizer.json" |
126 | | - download_file_with_auth(url, token, save_path) |
127 | 114 |
|
128 | | - if tokt == TOKENIZER_TYPE.SPM: |
129 | | - url = f"{repo}/resolve/main/tokenizer.model" |
130 | | - save_path = f"models/tokenizers/{name}/tokenizer.model" |
131 | | - download_file_with_auth(url, token, save_path) |
| 115 | +for model in models: |
| 116 | + try: |
| 117 | + download_model(model) |
| 118 | + except Exception as e: |
| 119 | + logger.error(f"Failed to download model {model['name']}. Error: {e}") |
132 | 120 |
|
133 | | - url = f"{repo}/raw/main/tokenizer_config.json" |
134 | | - save_path = f"models/tokenizers/{name}/tokenizer_config.json" |
135 | | - download_file_with_auth(url, token, save_path) |
136 | 121 |
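Factoring the download logic into `download_model` lets the driver loop above isolate failures: one unreachable repository is logged and skipped rather than aborting the whole run. Fetching every file via `resolve/main` also follows LFS pointers directly, which is what made the old "file smaller than 1 KB, retry via resolve" workaround removable. For a one-off re-fetch, the function can be called directly (sketch; assumes `token` is set as elsewhere in the script):

```python
# Hypothetical standalone call; the dict mirrors an entry of `models` above.
download_model({
    "name": "jina-v2-de",
    "tokt": TOKENIZER_TYPE.BPE,
    "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de",
})
```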
|
137 | 122 | # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: |
138 | | -# TODO: auto-update convert-hf-to-gguf.py with the generated function |
139 | 123 |
|
140 | 124 | src_ifs = "" |
141 | 125 | for model in models: |
@@ -224,11 +208,18 @@ def get_vocab_base_pre(self, tokenizer) -> str: |
224 | 208 | return res |
225 | 209 | """ |
226 | 210 |
|
227 | | -print(src_func) # noqa: NP100 |
| 211 | +convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") |
| 212 | +convert_py = convert_py_pth.read_text() |
| 213 | +convert_py = re.sub( |
| 214 | + r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", |
| 215 | + lambda m: m.group(1) + src_func + m.group(3), |
| 216 | + convert_py, |
| 217 | + flags=re.DOTALL | re.MULTILINE, |
| 218 | +) |
228 | 219 |
|
229 | | -logger.info("\n") |
230 | | -logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") |
231 | | -logger.info("\n") |
| 220 | +convert_py_pth.write_text(convert_py) |
| 221 | + |
| 222 | +logger.info("+++ convert-hf-to-gguf.py was updated") |
232 | 223 |
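The `re.sub` above is what retires the old copy-paste step (and the TODO removed at the top of the file): the generated `src_func` is spliced between two marker comments that convert-hf-to-gguf.py is expected to carry. `re.DOTALL` lets the non-greedy `.+?` span the multi-line body of the previous function, and the ` +` before the end marker preserves that marker's indentation across rewrites. A sketch of the expected layout at the destination (placeholder body; the real content is whatever this script generated last):

```python
class Model:  # hypothetical host class name; only the markers matter here
    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        ...  # body regenerated by convert-hf-to-gguf-update.py on every run
    # Marker: End get_vocab_base_pre
```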
|
233 | 224 | # generate tests for each tokenizer model |
234 | 225 |
|
|