Skip to content

Commit 2bf3fbf

Browse files
authored
ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update
* add --check-missing parameter
* check that pre-tokenizer hashes are up-to-date
1 parent 711d5e6 commit 2bf3fbf

File tree

3 files changed

+60
-12
lines changed

3 files changed

+60
-12
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# CI workflow: verify that the pre-tokenizer hashes embedded in
# convert_hf_to_gguf.py match what convert_hf_to_gguf_update.py would
# generate, so the two scripts never drift apart.
name: Check Pre-Tokenizer Hashes

on:
  # Only run when either of the two coupled scripts changes.
  push:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'
  pull_request:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'

jobs:
  pre-tokenizer-hashes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

      # Keep a pristine copy, then let the update script rewrite the
      # hashes in place (--check-missing skips model downloads).
      - name: Update pre-tokenizer hashes
        run: |
          cp convert_hf_to_gguf.py /tmp
          .venv/bin/python convert_hf_to_gguf_update.py --check-missing

      # If the regenerated file differs from the committed one, the
      # committed hashes are stale — fail with a diff for the author.
      - name: Check that committed pre-tokenizer hashes match generated version
        run: |
          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
            echo "Differences found:"
            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
            exit 1
          fi
          echo "Model pre-tokenizer hashes are up to date."

convert_hf_to_gguf_update.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ class TOKENIZER_TYPE(IntEnum):
5959
"--full", action="store_true",
6060
help="download full list of models - make sure you have access to all of them",
6161
)
62+
parser.add_argument(
63+
"--check-missing", action="store_true",
64+
help="only check for missing pre-tokenizer hashes",
65+
)
6266
parser.add_argument(
6367
"hf_token",
6468
help="optional HF token",
@@ -70,6 +74,10 @@ class TOKENIZER_TYPE(IntEnum):
7074
if hf_token is None:
7175
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
7276

77+
if args.check_missing and args.full:
78+
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
79+
args.check_missing = False
80+
7381
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
7482
# will be updated with time - contributions welcome
7583
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -222,12 +230,13 @@ def get_existing_models(convert_py):
222230
all_models = models.copy()
223231
models = [model for model in all_models if model["name"] not in existing_models]
224232

225-
logging.info(f"Downloading {len(models)} models...")
226-
for model in models:
227-
try:
228-
download_model(model)
229-
except Exception as e:
230-
logger.error(f"Failed to download model {model['name']}. Error: {e}")
233+
if not args.check_missing:
234+
logging.info(f"Downloading {len(models)} models...")
235+
for model in models:
236+
try:
237+
download_model(model)
238+
except Exception as e:
239+
logger.error(f"Failed to download model {model['name']}. Error: {e}")
231240

232241

233242
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1 @@
11
-r ./requirements-convert_legacy_llama.txt
2-
--extra-index-url https://download.pytorch.org/whl/cpu
3-
torch~=2.2.1; platform_machine != "s390x"
4-
5-
# torch s390x packages can only be found from nightly builds
6-
--extra-index-url https://download.pytorch.org/whl/nightly
7-
torch>=0.0.0.dev0; platform_machine == "s390x"

0 commit comments

Comments
 (0)