 #
 # TODO: generate tokenizer tests for llama.cpp
 #
-
+import subprocess
+import importlib.util
 import logging
 import os
 import pathlib
@@ -117,17 +118,47 @@ class TOKENIZER_TYPE(IntEnum):
117118 {"name" : "glm4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-hf" , },
118119 {"name" : "pixtral" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistral-community/pixtral-12b" , },
119120 {"name" : "seed-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base" , },
121+ {"name" : "ruri-large" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/cl-nagoya/ruri-large" , },
120122]
121123
122124
+def install_if_missing(package_spec: str, module_name: str = None):
+    """
+    Installs the package via pip if the module cannot be imported.
+
+    Args:
+        package_spec (str): The pip install spec, e.g., 'fugashi[unidic-lite]'.
+        module_name (str): The module name to check via import. If None, uses the base name from package_spec.
+    """
+    if module_name is None:
+        module_name = package_spec.split("[")[0]
+
+    if importlib.util.find_spec(module_name) is None:
+        print(f"Module '{module_name}' not found. Installing '{package_spec}'...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec])
+    else:
+        print(f"Module '{module_name}' is already installed.")
+
+
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as downloaded_file:
-        downloaded_file.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
+    try:
+        response = sess.get(url, headers=headers)
+        response.raise_for_status()
+
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        with open(save_path, 'wb') as downloaded_file:
+            downloaded_file.write(response.content)
+        logger.info(f"File {save_path} downloaded successfully")
+    except requests.HTTPError as e:
+        if e.response.status_code == 404:
+            logger.warning(f"URL not found: {url}")
+        else:
+            logger.error(f"HTTP error occurred when downloading {url}: {e}")
+    except requests.ConnectionError:
+        logger.error(f"Connection error occurred when downloading {url}")
+    except Exception as e:
+        logger.error(f"Unexpected error occurred when downloading {url}: {e}")
 
 
 def download_model(model):
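The new install_if_missing helper detects a missing module with importlib.util.find_spec and then installs the requested pip spec through the running interpreter (this assumes sys is already imported elsewhere in the script). A minimal usage sketch, with package names chosen purely for illustration:

    # Hypothetical calls illustrating the helper; only the fugashi spec is actually used by this patch.
    install_if_missing("fugashi[unidic-lite]")        # module name inferred as "fugashi"
    install_if_missing("sentencepiece")               # install spec and module name coincide
    install_if_missing("pillow", module_name="PIL")   # install spec and importable module differ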
@@ -137,7 +168,7 @@ def download_model(model):
 
     os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
 
-    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]
 
     if name == "gpt-4o":
         # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
@@ -194,6 +225,13 @@ def download_model(model):
         logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
         continue
 
+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+        with open(f"models/tokenizers/{name}/tokenizer_config.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            if "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab":
+                # MeCab-based tokenizers need fugashi to be installed
+                install_if_missing("fugashi[unidic-lite]")
+
     # create the tokenizer
     try:
         if name == "t5":
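The tokenizer_config.json check above covers Japanese BERT-style tokenizers such as cl-nagoya/ruri-large, whose word segmentation is done by MeCab and therefore needs fugashi at load time. For illustration only, the relevant part of such a config might look roughly like the following, shown as a Python dict; the surrounding field values are assumptions and vary per model, only the "word_tokenizer_type" key matters to the new check:

    # Illustrative excerpt of a MeCab-based tokenizer_config.json (values are assumptions).
    cfg_example = {
        "tokenizer_class": "BertJapaneseTokenizer",
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    }
    assert cfg_example["word_tokenizer_type"] == "mecab"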
@@ -213,15 +251,16 @@ def download_model(model):
     logger.info(f"chktok: {chktok}")
     logger.info(f"chkhsh: {chkhsh}")
 
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+    # print the "pre_tokenizer" content from the tokenizer.json, if it exists
+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer.json"):
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
 
     logger.info("")
 
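Since WPM checkpoints such as ruri-large may ship a vocab.txt instead of a tokenizer.json, the normalizer/pre_tokenizer dump is now skipped when that file is absent (and vocab.txt was added to the download list earlier in the patch). A quick sanity check one might run after the script has fetched the files, assuming the models/tokenizers/<name> layout used above and that fugashi[unidic-lite] is installed:

    # Hedged sanity check, not part of the patch: load the downloaded WPM tokenizer locally.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("models/tokenizers/ruri-large")
    print(tok.tokenize("自然言語処理のテスト"))  # should yield MeCab + WordPiece tokens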