Commit 036b5d6

convert: add support for Japanese Bert model
1 parent a8ea03d commit 036b5d6

File tree: 1 file changed (+56 / -17 lines)

convert_hf_to_gguf_update.py

Lines changed: 56 additions & 17 deletions
@@ -22,7 +22,8 @@
 #
 # TODO: generate tokenizer tests for llama.cpp
 #
-
+import subprocess
+import importlib.util
 import logging
 import os
 import pathlib
@@ -117,17 +118,47 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "ruri-large", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/cl-nagoya/ruri-large", },
 ]
 
 
+def install_if_missing(package_spec: str, module_name: str = None):
+    """
+    Installs the package via pip if the module cannot be imported.
+
+    Args:
+        package_spec (str): The pip install spec, e.g., 'fugashi[unidic-lite]'.
+        module_name (str): The module name to check via import. If None, uses the base name from package_spec.
+    """
+    if module_name is None:
+        module_name = package_spec.split("[")[0]
+
+    if importlib.util.find_spec(module_name) is None:
+        print(f"Module '{module_name}' not found. Installing '{package_spec}'...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec])
+    else:
+        print(f"Module '{module_name}' is already installed.")
+
+
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as downloaded_file:
-        downloaded_file.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
+    try:
+        response = sess.get(url, headers=headers)
+        response.raise_for_status()
+
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        with open(save_path, 'wb') as downloaded_file:
+            downloaded_file.write(response.content)
+        logger.info(f"File {save_path} downloaded successfully")
+    except requests.HTTPError as e:
+        if e.response.status_code == 404:
+            logger.warning(f"URL not found: {url}")
+        else:
+            logger.error(f"HTTP error occurred when downloading {url}: {e}")
+    except requests.ConnectionError:
+        logger.error(f"Connection error occurred when downloading {url}")
+    except Exception as e:
+        logger.error(f"Unexpected error occurred when downloading {url}: {e}")
 
 
 def download_model(model):
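The install_if_missing helper added above only shells out to pip when importlib cannot locate the module, so repeated runs of the update script skip the install step. A minimal self-contained sketch of the same check, spelled out inline for the fugashi case this commit targets (illustration only, not part of the patch):

import importlib.util
import subprocess
import sys

# Equivalent of install_if_missing("fugashi[unidic-lite]"): check for the "fugashi"
# module and install the pip spec (with the unidic-lite dictionary extra) if it is absent.
if importlib.util.find_spec("fugashi") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fugashi[unidic-lite]"])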
@@ -137,7 +168,7 @@ def download_model(model):
 
     os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
 
-    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]
 
     if name == "gpt-4o":
         # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
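Adding "vocab.txt" to the file list matters because WordPiece (WPM) checkpoints such as ruri-large typically ship a plain vocab.txt rather than a full tokenizer.json. A rough, self-contained sketch of the URLs the script will now try for the new entry, assuming the standard Hugging Face "resolve/main" raw-file layout (the download loop itself is not part of this hunk):

# Illustration only: print source URL -> local path for each file of the new model entry.
repo = "https://huggingface.co/cl-nagoya/ruri-large"
name = "ruri-large"
files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]
for file in files:
    print(f"{repo}/resolve/main/{file}  ->  models/tokenizers/{name}/{file}")

If one of these files is missing from the repository (for example, a BERT-style repo that has no tokenizer.json), the 404 is now logged as a warning by the reworked download_file_with_auth instead of aborting the run.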
@@ -194,6 +225,13 @@ def download_model(model):
         logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
         continue
 
+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+        with open(f"models/tokenizers/{name}/tokenizer_config.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+        if "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab":
+            # MeCab needs to be installed via fugashi
+            install_if_missing("fugashi[unidic-lite]")
+
     # create the tokenizer
     try:
         if name == "t5":
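The new check reads tokenizer_config.json and installs fugashi (plus the unidic-lite dictionary) only when the config declares a MeCab word tokenizer, which Japanese BERT tokenizers commonly use. A stripped-down illustration of the kind of config that triggers it; the field values here are hypothetical and not copied from the ruri-large repository:

import json

cfg_text = '''
{
  "tokenizer_class": "BertJapaneseTokenizer",
  "word_tokenizer_type": "mecab",
  "subword_tokenizer_type": "wordpiece"
}
'''
cfg = json.loads(cfg_text)
# This is the condition the added code looks for before calling install_if_missing.
assert "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab"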
@@ -213,15 +251,16 @@
     logger.info(f"chktok: {chktok}")
     logger.info(f"chkhsh: {chkhsh}")
 
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+    # print the "pre_tokenizer" content from the tokenizer.json, if it exists
+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer.json"):
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
 
     logger.info("")
 
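With these changes, the ruri-large entry flows through the same hashing path as the other models: the script loads the tokenizer, encodes the checksum text, and logs the chkhsh value that convert_hf_to_gguf.py uses to recognize the pre-tokenizer. An illustrative standalone sketch of that step, assuming the tokenizer files were already downloaded by the script (the real checksum text is defined in the script and not reproduced here):

from hashlib import sha256
from transformers import AutoTokenizer

# Assumes models/tokenizers/ruri-large was populated by the update script and that
# fugashi[unidic-lite] is installed, as handled by the new check above.
CHK_TXT = "..."  # placeholder; the script uses its own fixed checksum string
tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/ruri-large")
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)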