Commit 7697161 (parent a3c3084)

convert : allow partial update to the chkhsh pre-tokenizer list

8 files changed: +600 -71 lines

convert_hf_to_gguf.py (12 additions, 6 deletions)
```diff
@@ -674,12 +674,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -731,9 +731,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -797,6 +794,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
@@ -806,6 +806,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
 
         if res is None:
             logger.warning("\n")
```
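Note: each chkhsh literal in this chain is the SHA-256 hex digest of the stringified token-id list that a model's tokenizer produces for the fixed test string CHK_TXT (defined in convert_hf_to_gguf_update.py, see below). A minimal sketch of the computation; the tokenizer directory is the local download location used by the update script, and the model name is illustrative:

```python
# Sketch of how a chkhsh value is derived (mirrors the update script below);
# "models/tokenizers/<name>" is where the update script stores downloads.
from hashlib import sha256
from transformers import AutoTokenizer

CHK_TXT = "..."  # the pre-tokenizer stress string defined in the update script

tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/chatglm-bpe")
chktok = tokenizer.encode(CHK_TXT)                 # token ids for the test string
chkhsh = sha256(str(chktok).encode()).hexdigest()  # value matched in the chain above
```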

convert_hf_to_gguf_update.py (114 additions, 65 deletions)
```diff
@@ -1,28 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
-
 import logging
 import os
 import pathlib
@@ -32,15 +10,22 @@
 import sys
 import json
 import shutil
+import argparse
 
 from hashlib import sha256
 from enum import IntEnum, auto
 from transformers import AutoTokenizer
+from collections import OrderedDict
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()
 
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
+
 
 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
@@ -49,20 +34,49 @@ class TOKENIZER_TYPE(IntEnum):
     UGM = auto()
 
 
+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+
+Instructions:
+
+- Add a new model to the "models" list
+- Run the script with your huggingface token
+    By default, token will be read from ~/.cache/huggingface/token
+- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+    "--full", action="store_true",
+    help="download full list of models - make sure you have access to all of them",
+)
+parser.add_argument(
+    "hf_token",
+    help="optional HF token",
+    nargs="?",
+)
+args = parser.parse_args()
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+
+if hf_token is None:
+    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+    sys.exit(1)
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
 # TODO: add models here, base models preferred
 models = [
     {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
```
```diff
@@ -114,11 +128,19 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
     {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", },
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]
 
+# some models are known to be broken upstream, so we will skip them as exceptions
+pre_computed_hashes = [
+    # chatglm-bpe has 2 hashes, why?
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
+]
+
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
@@ -169,9 +191,29 @@ def download_model(model):
         if os.path.isfile(save_path):
             logger.info(f"{name}: File {save_path} already exists - skipping")
             continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
+
 
+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+    matches = re.findall(pattern, convert_py)
+    output = OrderedDict()  # make sure order is preserved
+    for chkhsh, res in matches:
+        output[res] = chkhsh
+    return output
 
+
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+    # Filter out models that already exist in convert_hf_to_gguf.py
+    existing_models = get_existing_models(convert_py)
+    all_models = models.copy()
+    models = [model for model in all_models if model["name"] not in existing_models]
+
+print(f"Downloading {len(models)} models...")
 for model in models:
     try:
         download_model(model)
```
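To see what get_existing_models() recovers, here is the same regex run over a small excerpt in the style of the generated if-chain (the sample reuses the bert-bge entry shown in the convert_hf_to_gguf.py diff above):

```python
# Quick check of the get_existing_models() regex against one sample if-block.
import re

sample = '''
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
'''

pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
print(re.findall(pattern, sample))
# [('0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f', 'bert-bge')]
```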
```diff
@@ -182,9 +224,10 @@ def download_model(model):
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
 
 src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
     name = model["name"]
     tokt = model["tokt"]
+    chkhsh = model.get("chkhsh")
 
     if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
         continue
@@ -195,35 +238,43 @@ def download_model(model):
         continue
 
     # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
-
-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
-
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
-
-    logger.info("")
+    if chkhsh is not None:
+        # if the model has a pre-computed hash, use it
+        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+    elif name in existing_models:
+        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
+        chkhsh = existing_models[name]
+    else:
+        # otherwise, compute the hash of the tokenizer
+        try:
+            if name == "t5":
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        except OSError as e:
+            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+            continue  # Skip to the next model if the tokenizer can't be loaded
+
+        chktok = tokenizer.encode(CHK_TXT)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.info(f"model: {name}")
+        logger.info(f"tokt: {tokt}")
+        logger.info(f"repo: {model['repo']}")
+        logger.info(f"chktok: {chktok}")
+        logger.info(f"chkhsh: {chkhsh}")
+
+        # print the "pre_tokenizer" content from the tokenizer.json
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+
+        logger.info("")
 
     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
```
```diff
@@ -271,8 +322,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
 """
 
-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
```
