#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
import os
import pathlib
import re
import requests
import sys
import json
import shutil
import argparse

from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer
from collections import OrderedDict

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()

convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None

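# tokenizer family tags used below (standard names): SPM = SentencePiece,
# BPE = byte-pair encoding, WPM = WordPiece, UGM = Unigram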
class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
    UGM = auto()


DOC_STRING = """
This script downloads the tokenizer models of the specified models from Huggingface and
generates the get_vocab_base_pre() function for convert_hf_to_gguf.py

/!\\ It is intended to be used by contributors and is not meant to be run by end users

This is necessary in order to analyze the type of pre-tokenizer used by the model and
provide the necessary information to llama.cpp via the GGUF header in order to implement
the same pre-tokenizer.

ref: https://github.com/ggml-org/llama.cpp/pull/6920

Instructions:

- Add a new model to the "models" list
- Run the script with your huggingface token
  (by default, the token is read from ~/.cache/huggingface/token)
- The get_vocab_base_pre() function in convert_hf_to_gguf.py will be updated automatically
- Update llama.cpp with the new pre-tokenizer if necessary
"""
# TODO: generate tokenizer tests for llama.cpp

parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
    "--full", action="store_true",
    help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
    "hf_token",
    help="optional HF token",
    nargs="?",
)
args = parser.parse_args()
hf_token = args.hf_token if args.hf_token is not None else hf_token

if hf_token is None:
    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
    sys.exit(1)

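# typical invocations (HF tokens start with "hf_"):
#   python3 convert_hf_to_gguf_update.py hf_xxxx      # hash only models that are new
#   python3 convert_hf_to_gguf_update.py --full       # re-download and re-hash all models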
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
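
# the check text above deliberately mixes: runs of spaces/tabs/newlines, single and
# multi-codepoint emoji, Khmer and CJK text, digit groupings, Cyrillic, repeated
# punctuation, and English contractions in mixed case
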
# TODO: add models here, base models preferred
models = [
    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
    # ... (many more model entries elided) ...
114128 {"name" : "trillion" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/trillionlabs/Trillion-7B-preview" , },
115129 {"name" : "bailingmoe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/inclusionAI/Ling-lite" , },
116130 {"name" : "llama4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct" , },
131+ {"name" : "chatglm-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-chat" , },
117132 {"name" : "glm4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-hf" , },
118133 {"name" : "pixtral" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistral-community/pixtral-12b" , },
119134 {"name" : "seed-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base" , },
120135]
121136
# some models are known to be broken upstream, so we skip downloading their
# tokenizers and use these pre-computed hashes instead
pre_computed_hashes = [
    # chatglm-bpe has 2 hashes, why?
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
]
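# (the duplicate entry is presumably because the upstream tokenizer.json changed at
# some point, so both the old and the new download of glm-4-9b-chat are recognized)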


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "wb") as f:
        f.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


def download_model(model):
    name = model["name"]
    repo = model["repo"]

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
    # ... (model-specific additions to the file list elided) ...

    for file in files:
        save_path = f"models/tokenizers/{name}/{file}"
        if os.path.isfile(save_path):
            logger.info(f"{name}: File {save_path} already exists - skipping")
            continue
        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)


# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
# returns mapping res --> chkhsh
def get_existing_models(convert_py):
    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
    matches = re.findall(pattern, convert_py)
    output = OrderedDict()  # make sure order is preserved
    for chkhsh, res in matches:
        output[res] = chkhsh
    return output
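
# for reference, the pattern above matches the chkhsh blocks this script generates
# in convert_hf_to_gguf.py, e.g. (hash taken from the chatglm-bpe entry above):
#
#     if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
#         # ref: https://huggingface.co/THUDM/glm-4-9b-chat
#         res = "chatglm-bpe"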


existing_models = {}
all_models = models.copy()
if not args.full:
    # Filter out models that already exist in convert_hf_to_gguf.py
    existing_models = get_existing_models(convert_py)
    all_models = models.copy()
    models = [model for model in all_models if model["name"] not in existing_models]
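
# without --full, only models not yet recognized by get_vocab_base_pre() are
# downloaded and hashed; hashes for the known models are reused further below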

print(f"Downloading {len(models)} models...")
for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

src_ifs = ""
for model in [*all_models, *pre_computed_hashes]:
    name = model["name"]
    tokt = model["tokt"]
    chkhsh = model.get("chkhsh")

    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue

    # Skip if the tokenizer folder does not exist or there were download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    if chkhsh is not None:
        # if the model has a pre-computed hash, use it
        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
    elif name in existing_models:
        # if the model already exists in convert_hf_to_gguf.py, skip computing the hash
        chkhsh = existing_models[name]
    else:
        # otherwise, compute the hash of the tokenizer
        try:
            if name == "t5":
                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
            else:
                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
        except OSError as e:
            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
            continue  # Skip to the next model if the tokenizer can't be loaded

        chktok = tokenizer.encode(CHK_TXT)
        chkhsh = sha256(str(chktok).encode()).hexdigest()
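
        # note: the hash is computed over the string repr of the token-id list, so
        # any change in normalization or pre-tokenization yields a different chkhsh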

        logger.info(f"model: {name}")
        logger.info(f"tokt: {tokt}")
        logger.info(f"repo: {model['repo']}")
        logger.info(f"chktok: {chktok}")
        logger.info(f"chkhsh: {chkhsh}")

        # print the "pre_tokenizer" content from the tokenizer.json
        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
            cfg = json.load(f)
            normalizer = cfg["normalizer"]
            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
            pre_tokenizer = cfg["pre_tokenizer"]
            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
            if "ignore_merges" in cfg["model"]:
                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

        logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n\n"

src_func = f"""
    def get_vocab_base_pre(self, tokenizer) -> str:
        # ... (template body elided: it encodes CHK_TXT with the model's tokenizer,
        # hashes the token ids the same way as above, then runs the generated checks) ...
{src_ifs}
        return res
"""
convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)
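
# convert_hf_to_gguf.py delimits the replaceable region with the marker comments
# referenced in the regex above:
#
#     # Marker: Start get_vocab_base_pre
#     def get_vocab_base_pre(self, tokenizer) -> str:
#         ...
#     # Marker: End get_vocab_base_pre
#
# everything between the markers is replaced with the freshly generated function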