 import re

 import requests
-import sys
 import json
 import shutil
 import argparse
@@ -69,8 +68,7 @@ class TOKENIZER_TYPE(IntEnum):
 hf_token = args.hf_token if args.hf_token is not None else hf_token

 if hf_token is None:
-    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
-    sys.exit(1)
+    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")

 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
@@ -151,7 +149,7 @@ class TOKENIZER_TYPE(IntEnum):


 def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
+    headers = {"Authorization": f"Bearer {token}"} if token else None
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -250,20 +248,18 @@ def get_existing_models(convert_py):
     else:
         # otherwise, compute the hash of the tokenizer

-        # Skip if the tokenizer folder does not exist or there are other download issues previously
-        if not os.path.exists(f"models/tokenizers/{name}"):
-            logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-            continue
+        # Fail if the tokenizer folder with config does not exist or there are other download issues previously
+        if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+            raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")

         try:
             logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
             if name == "t5":
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
             else:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-        except OSError as e:
-            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-            continue  # Skip to the next model if the tokenizer can't be loaded
+        except Exception as e:
+            raise OSError(f"Error loading tokenizer for model {name}.") from e

         chktok = tokenizer.encode(CHK_TXT)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
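
The download helper now builds the Authorization header only when a token is actually available, so public repositories stay downloadable anonymously while gated ones still authenticate. A minimal standalone sketch of that pattern (the URL and save path below are hypothetical placeholders, and `sess` stands in for the script's shared requests session):

    import os
    import requests

    sess = requests.Session()

    def download_file_with_auth(url, token, save_path):
        # Attach the bearer token only if one was provided; headers=None
        # means requests sends no extra headers at all.
        headers = {"Authorization": f"Bearer {token}"} if token else None
        response = sess.get(url, headers=headers)
        response.raise_for_status()
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(response.content)

    # Anonymous download of a public file (hypothetical example):
    download_file_with_auth(
        "https://huggingface.co/gpt2/resolve/main/tokenizer_config.json",
        token=None,
        save_path="models/tokenizers/gpt2/tokenizer_config.json",
    )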
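Similarly, the tokenizer-loading path now fails fast instead of skipping broken models, which previously could produce a silently incomplete set of pre-tokenizer hashes. The `raise ... from e` form chains the original exception so the underlying cause stays visible; a small illustration of the pattern (model name hypothetical):

    from transformers import AutoTokenizer

    name = "llama-bpe"  # hypothetical model name
    try:
        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except Exception as e:
        # "from e" preserves the original error as __cause__, so the
        # real failure reason is still printed with the traceback.
        raise OSError(f"Error loading tokenizer for model {name}.") from e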