73 changes: 56 additions & 17 deletions convert_hf_to_gguf_update.py
@@ -22,7 +22,8 @@
#
# TODO: generate tokenizer tests for llama.cpp
#
-
+import subprocess
+import importlib.util
Collaborator:

this can be removed

Contributor Author:

I removed it

import logging
import os
import pathlib
@@ -117,17 +118,47 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "ruri-large", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/cl-nagoya/ruri-large", },
Collaborator:

If you add it here, you must also run the script so it updates convert_hf_to_gguf and include the change in this PR

Collaborator:

and btw, do we even have the C++ code to handle this? is this already tested?

Contributor Author:

I tested that model and similar models (ruri-*) locally for the embedding task, and they worked.

> If you add it here, you must also run the script so it updates convert_hf_to_gguf and include the change in this PR

I'm sorry. About this, like I said before, I don't have access to many models in the list, so it's hard to run all the listed models to update convert_hf_to_gguf. Can you do that for me? If not, how do you think we should handle this (e.g., leave a comment noting that some Japanese models require vocab.txt)?

Collaborator:

When #13847 is merged, you can run the script again and this time it will only process the newly added model

]


+def install_if_missing(package_spec: str, module_name: str | None = None):
+    """
+    Installs the package via pip if the module cannot be imported.
+
+    Args:
+        package_spec (str): The pip install spec, e.g., 'fugashi[unidic-lite]'.
+        module_name (str): The module name to check via import. If None, uses the base name from package_spec.
+    """
+    if module_name is None:
+        module_name = package_spec.split("[")[0]
+
+    if importlib.util.find_spec(module_name) is None:
+        print(f"Module '{module_name}' not found. Installing '{package_spec}'...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec])
+    else:
+        print(f"Module '{module_name}' is already installed.")


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as downloaded_file:
-        downloaded_file.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
+    try:
+        response = sess.get(url, headers=headers)
+        response.raise_for_status()
+
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        with open(save_path, 'wb') as downloaded_file:
+            downloaded_file.write(response.content)
+        logger.info(f"File {save_path} downloaded successfully")
+    except requests.HTTPError as e:
+        if e.response.status_code == 404:
+            logger.warning(f"URL not found: {url}")
+        else:
+            logger.error(f"HTTP error occurred when downloading {url}: {e}")
+    except requests.ConnectionError:
+        logger.error(f"Connection error occurred when downloading {url}")
+    except Exception as e:
+        logger.error(f"Unexpected error occurred when downloading {url}: {e}")
Collaborator:

1. This whole multiple except can be just one single `except Exception as e`. No need to over-engineer the error handling if you're only interested in logging it (see the sketch after this list).
2. The old code doesn't have this handling, so it will simply terminate the script if there's an error. Now with this, errors will be ignored. I think this is not the expected behavior.
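A minimal sketch of that suggestion, assuming the script's existing `sess` and `logger`: a single handler that logs and re-raises, which also preserves the old fail-fast behavior (not part of the PR):

```python
def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    try:
        response = sess.get(url, headers=headers)
        response.raise_for_status()
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as downloaded_file:
            downloaded_file.write(response.content)
        logger.info(f"File {save_path} downloaded successfully")
    except Exception as e:
        # one handler is enough when the goal is just logging;
        # re-raise so the script still terminates on error, as before
        logger.error(f"Failed to download {url}: {e}")
        raise
```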

Contributor Author:

Actually, I don't have access to many of the models in the list, so the script terminates every time I run it (unless I comment out the other models). The instructions at the beginning of the file say to "Add a new model to the models list", which may confuse users.

What is your suggestion about this?

Collaborator:

I usually just temporarily comment out all the other models and then run the script. But yes, having the ability to update only the newly added model would be a better approach. I will add it in another PR.

Collaborator:

For now, let's simply remove this change from this PR

Contributor Author:

I removed this change.

> I will add it in another PR

Thank you in advance!



def download_model(model):
@@ -137,7 +168,7 @@ def download_model(model):

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

-    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]

    if name == "gpt-4o":
        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
@@ -194,6 +225,13 @@ def download_model(model):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+        with open(f"models/tokenizers/{name}/tokenizer_config.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            if "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab":
+                # Mecab needs to be installed via fugashi
+                install_if_missing("fugashi[unidic-lite]")

    # create the tokenizer
    try:
        if name == "t5":
@@ -213,15 +251,16 @@
    logger.info(f"chktok: {chktok}")
    logger.info(f"chkhsh: {chkhsh}")

-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+    # print the "pre_tokenizer" content from the tokenizer.json, if it exists
+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer.json"):
Collaborator:

This will alter the behavior of other models

Instead, check for `cfg["word_tokenizer_type"] == "mecab"` and only skip this for that particular model (a sketch follows below).
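A minimal sketch of that targeted check, assuming the dict loaded from tokenizer_config.json earlier in the PR is kept around (named `tok_cfg` here; hypothetical placement, not part of the PR):

```python
# hypothetical: tok_cfg is the dict loaded from tokenizer_config.json above
if tok_cfg.get("word_tokenizer_type") == "mecab":
    # mecab-based (WPM) models ship vocab.txt instead of tokenizer.json,
    # so only they skip the inspection below
    logger.info(f"{name}: mecab tokenizer, skipping tokenizer.json inspection")
else:
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        logger.info("normalizer: " + json.dumps(cfg["normalizer"], indent=4))
        logger.info("pre_tokenizer: " + json.dumps(cfg["pre_tokenizer"], indent=4))
        if "ignore_merges" in cfg["model"]:
            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
```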

Contributor Author:

I'm sorry. I just fixed that

with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

logger.info("")
