 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
-
 import logging
 import os
 import pathlib
 import re
 import requests
 import sys
 import json
 import shutil
+import argparse
 
 from hashlib import sha256
 from enum import IntEnum, auto
 from transformers import AutoTokenizer
 logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()
 
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
+
 
 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
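The file read added above only supplies a fallback: the argparse block introduced in the next hunk lets an explicit CLI token override it. A minimal sketch of the resulting precedence, under the same assumptions as the diff (the helper name is hypothetical):

```python
import pathlib


def resolve_hf_token(cli_token: str | None) -> str | None:
    # an explicit CLI token wins; otherwise fall back to the token cached by
    # `huggingface-cli login`, and finally to None (which aborts the script)
    if cli_token is not None:
        return cli_token
    cached = pathlib.Path.home() / ".cache" / "huggingface" / "token"
    return cached.read_text(encoding="utf-8").strip() if cached.exists() else None
```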
@@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
     UGM = auto()
 
 
+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+
+Instructions:
+
+- Add a new model to the "models" list
+- Run the script with your Huggingface token
+  (by default, the token is read from ~/.cache/huggingface/token)
+- The get_vocab_base_pre() function in convert_hf_to_gguf.py will be updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+    "--full", action="store_true",
+    help="download the full list of models - make sure you have access to all of them",
+)
+parser.add_argument(
+    "hf_token",
+    help="optional HF token",
+    nargs="?",
+)
+args = parser.parse_args()
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+
+if hf_token is None:
+    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+    sys.exit(1)
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
 # TODO: add models here, base models preferred
 models = [
     {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -103,7 +116,6 @@ class TOKENIZER_TYPE(IntEnum):
103116 {"name" : "exaone" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" , },
104117 {"name" : "phi-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/microsoft/phi-2" , },
105118 {"name" : "chameleon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/facebook/chameleon-7b" , },
106- {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , },
107119 {"name" : "roberta-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sentence-transformers/stsb-roberta-base" },
108120 {"name" : "gigachat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct" },
109121 {"name" : "megrez" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Infinigence/Megrez-3B-Instruct" },
@@ -114,11 +126,19 @@ class TOKENIZER_TYPE(IntEnum):
114126 {"name" : "trillion" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/trillionlabs/Trillion-7B-preview" , },
115127 {"name" : "bailingmoe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/inclusionAI/Ling-lite" , },
116128 {"name" : "llama4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct" , },
117- {"name" : "glm4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-hf" , },
118129 {"name" : "pixtral" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistral-community/pixtral-12b" , },
119130 {"name" : "seed-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base" , },
120131]
121132
133+ # some models are known to be broken upstream, so we will skip them as exceptions
134+ pre_computed_hashes = [
135+ # chatglm-bpe has 2 hashes, why?
136+ {"name" : "chatglm-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-chat" , "chkhsh" : "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" },
137+ {"name" : "chatglm-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-chat" , "chkhsh" : "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516" },
138+ {"name" : "glm4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-hf" , "chkhsh" : "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2" },
139+ {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , "chkhsh" : "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35" },
140+ ]
141+
122142
123143def download_file_with_auth (url , token , save_path ):
124144 headers = {"Authorization" : f"Bearer { token } " }
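The body of download_file_with_auth() falls outside this hunk. A sketch of what such a helper plausibly does, based on its call sites in this file (an assumption, not the file's exact implementation):

```python
import os

import requests

sess = requests.Session()


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = sess.get(url, headers=headers)  # authenticated GET against huggingface.co
    response.raise_for_status()                # fail loudly on 401/403/404
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "wb") as f:           # tokenizer files can be binary (e.g. *.model)
        f.write(response.content)
```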
@@ -169,9 +189,29 @@ def download_model(model):
         if os.path.isfile(save_path):
             logger.info(f"{name}: File {save_path} already exists - skipping")
             continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
+
+
+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+    matches = re.findall(pattern, convert_py)
+    output = {}
+    for chkhsh, res in matches:
+        output[res] = chkhsh
+    return output
+
 
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+    # Filter out models that already exist in convert_hf_to_gguf.py
+    existing_models = get_existing_models(convert_py)
+    all_models = models.copy()
+    models = [model for model in all_models if model["name"] not in existing_models]
 
+logger.info(f"Downloading {len(models)} models...")
 for model in models:
     try:
         download_model(model)
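For reference, the blocks that get_existing_models() parses out of convert_hf_to_gguf.py are the ones this script itself regenerates (see the src_ifs lines further down). Using the glm4 hash from pre_computed_hashes above as the example, a parsed block looks like this:

```python
        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
            res = "glm4"
```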
@@ -182,9 +222,10 @@ def download_model(model):
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
 
 src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
     name = model["name"]
     tokt = model["tokt"]
+    chkhsh = model.get("chkhsh")
 
     if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
         continue
@@ -195,35 +236,44 @@ def download_model(model):
         continue
 
     # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
-
-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
-
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
-
-    logger.info("")
+    if chkhsh is not None:
+        # if the model has a pre-computed hash, use it
+        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+    elif name in existing_models:
+        # if the model already exists in convert_hf_to_gguf.py, skip computing the hash
+        chkhsh = existing_models[name]
+    else:
+        # otherwise, compute the hash of the tokenizer
+        try:
+            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
+            if name == "t5":
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        except OSError as e:
+            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+            continue  # Skip to the next model if the tokenizer can't be loaded
+
+        chktok = tokenizer.encode(CHK_TXT)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.info(f"model: {name}")
+        logger.info(f"tokt: {tokt}")
+        logger.info(f"repo: {model['repo']}")
+        logger.info(f"chktok: {chktok}")
+        logger.info(f"chkhsh: {chkhsh}")
+
+        # print the "pre_tokenizer" content from the tokenizer.json
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+
+        logger.info("")
 
     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
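A standalone sketch of the fingerprint this loop computes: the token-id sequence produced for CHK_TXT is stringified and hashed, so any difference in pre-tokenizer behaviour yields a different chkhsh (the local path below is hypothetical; CHK_TXT is the string defined above):

```python
from hashlib import sha256

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/llama-bpe")  # hypothetical download dir
chktok = tokenizer.encode(CHK_TXT)                 # token ids differ if the pre-tokenizer differs
chkhsh = sha256(str(chktok).encode()).hexdigest()  # 64-hex-char value matched in convert_hf_to_gguf.py
```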
@@ -271,8 +321,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
 """
 
-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
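The marker-based substitution keeps regeneration idempotent: everything between the start and end markers is replaced wholesale on each run. The remaining arguments of this re.sub call fall outside the hunk; a sketch of how the full call plausibly reads (the DOTALL flag is an assumption, needed for `.+?` to span the multi-line function body):

```python
import re

convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL,  # assumption: lets ".+?" match across newlines between the markers
)
```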
@@ -367,6 +415,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
         continue  # Skip this model and continue with the next one in the loop
 
+    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
+        logger.info(f"Skipping vocab files for model {name}: no GGUF file found")
+        continue
+
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
             f.write(f"{text}")