 import re

 import requests
-import sys
 import json
 import shutil
 import argparse
@@ -69,8 +68,7 @@ class TOKENIZER_TYPE(IntEnum):
 hf_token = args.hf_token if args.hf_token is not None else hf_token

 if hf_token is None:
-    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
-    sys.exit(1)
+    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")

 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
@@ -151,7 +149,7 @@ class TOKENIZER_TYPE(IntEnum):


 def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
+    headers = {"Authorization": f"Bearer {token}"} if token else None
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -250,20 +248,18 @@ def get_existing_models(convert_py):
     else:
         # otherwise, compute the hash of the tokenizer

-        # Skip if the tokenizer folder does not exist or there are other download issues previously
-        if not os.path.exists(f"models/tokenizers/{name}"):
-            logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-            continue
+        # Fail if the tokenizer folder with config does not exist or there are other download issues previously
+        if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+            raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")

         try:
             logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
             if name == "t5":
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
             else:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-        except OSError as e:
-            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-            continue  # Skip to the next model if the tokenizer can't be loaded
+        except Exception as e:
+            raise OSError(f"Error loading tokenizer for model {name}.") from e

         chktok = tokenizer.encode(CHK_TXT)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
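
The download helper now builds the Authorization header only when a token is actually available, so public repositories stay downloadable anonymously while gated ones still authenticate. A minimal standalone sketch of that pattern (the URL and save path below are hypothetical placeholders, and `sess` stands in for the script's shared requests session):

    import os
    import requests

    sess = requests.Session()

    def download_file_with_auth(url, token, save_path):
        # Attach the bearer token only if one was provided; headers=None
        # means requests sends no extra headers at all.
        headers = {"Authorization": f"Bearer {token}"} if token else None
        response = sess.get(url, headers=headers)
        response.raise_for_status()
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(response.content)

    # Anonymous download of a public file (hypothetical example):
    download_file_with_auth(
        "https://huggingface.co/gpt2/resolve/main/tokenizer_config.json",
        token=None,
        save_path="models/tokenizers/gpt2/tokenizer_config.json",
    )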
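Similarly, the tokenizer-loading path now fails fast instead of skipping broken models, which previously could produce a silently incomplete set of pre-tokenizer hashes. The `raise ... from e` form chains the original exception so the underlying cause stays visible; a small illustration of the pattern (model name hypothetical):

    from transformers import AutoTokenizer

    name = "llama-bpe"  # hypothetical model name
    try:
        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except Exception as e:
        # "from e" preserves the original error as __cause__, so the
        # real failure reason is still printed with the traceback.
        raise OSError(f"Error loading tokenizer for model {name}.") from e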