Skip to content

Commit 2bf3fbf

Browse files
authored
ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update
* add --check-missing parameter
* check that pre-tokenizer hashes are up-to-date
1 parent 711d5e6 commit 2bf3fbf

File tree

3 files changed

+60
-12
lines changed

3 files changed

+60
-12
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# CI workflow: verify that the pre-tokenizer hashes embedded in
# convert_hf_to_gguf.py match what convert_hf_to_gguf_update.py would
# generate, so the two scripts never drift apart.
name: Check Pre-Tokenizer Hashes

on:
  # Only run when either of the two coupled scripts changes.
  push:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'
  pull_request:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'

jobs:
  pre-tokenizer-hashes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

      # Keep a pristine copy, then let the update script rewrite the
      # hashes in place (--check-missing skips model downloads).
      - name: Update pre-tokenizer hashes
        run: |
          cp convert_hf_to_gguf.py /tmp
          .venv/bin/python convert_hf_to_gguf_update.py --check-missing

      # If the regenerated file differs from the committed one, the
      # committed hashes are stale — fail with a diff for the author.
      - name: Check that committed pre-tokenizer hashes match generated version
        run: |
          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
            echo "Differences found:"
            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
            exit 1
          fi
          echo "Model pre-tokenizer hashes are up to date."

convert_hf_to_gguf_update.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ class TOKENIZER_TYPE(IntEnum):
5959
"--full", action="store_true",
6060
help="download full list of models - make sure you have access to all of them",
6161
)
62+
parser.add_argument(
63+
"--check-missing", action="store_true",
64+
help="only check for missing pre-tokenizer hashes",
65+
)
6266
parser.add_argument(
6367
"hf_token",
6468
help="optional HF token",
@@ -70,6 +74,10 @@ class TOKENIZER_TYPE(IntEnum):
7074
if hf_token is None:
7175
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
7276

77+
if args.check_missing and args.full:
78+
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
79+
args.check_missing = False
80+
7381
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
7482
# will be updated with time - contributions welcome
7583
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -222,12 +230,13 @@ def get_existing_models(convert_py):
222230
all_models = models.copy()
223231
models = [model for model in all_models if model["name"] not in existing_models]
224232

225-
logging.info(f"Downloading {len(models)} models...")
226-
for model in models:
227-
try:
228-
download_model(model)
229-
except Exception as e:
230-
logger.error(f"Failed to download model {model['name']}. Error: {e}")
233+
if not args.check_missing:
234+
logging.info(f"Downloading {len(models)} models...")
235+
for model in models:
236+
try:
237+
download_model(model)
238+
except Exception as e:
239+
logger.error(f"Failed to download model {model['name']}. Error: {e}")
231240

232241

233242
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1 @@
11
-r ./requirements-convert_legacy_llama.txt
2-
--extra-index-url https://download.pytorch.org/whl/cpu
3-
torch~=2.2.1; platform_machine != "s390x"
4-
5-
# torch s390x packages can only be found from nightly builds
6-
--extra-index-url https://download.pytorch.org/whl/nightly
7-
torch>=0.0.0.dev0; platform_machine == "s390x"

0 commit comments

Comments
 (0)