73 changes: 57 additions & 16 deletions convert_hf_to_gguf_update.py
@@ -22,7 +22,8 @@
#
# TODO: generate tokenizer tests for llama.cpp
#

import subprocess
+import importlib.util
Collaborator:
this can be removed

Contributor Author:
I removed it

import logging
import os
import pathlib
@@ -117,17 +118,47 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "ruri-large", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/cl-nagoya/ruri-large", },
Collaborator:
If you add it here, you must also run the script so it updates convert_hf_to_gguf and include the change in this PR

Collaborator:
And btw, do we even have the C++ code to handle this? Is this already tested?

Contributor Author:
I tested that model and similar models (ruri-*) locally for the embedding task, and it worked.

> If you add it here, you must also run the script so it updates convert_hf_to_gguf and include the change in this PR

I'm sorry. As I said before, I don't have access to many of the models in the list, so it's hard to run all of them to update convert_hf_to_gguf. Can you do that for me? If not, how do you think we should handle this (e.g., leave a comment noting that some Japanese models require vocab.txt)?

Collaborator:
When #13847 is merged, you can run the script again, and this time it will only process the newly added model.

]


+def fugashi_check():
+    """
+    Check if fugashi and a Japanese dictionary are installed and can be imported.
+    """
+    try:
+        import fugashi
+        tagger = fugashi.Tagger()
+    except ImportError:
+        raise ImportError(
+            "fugashi is missing, install it via: pip install 'fugashi[unidic-lite]'"
+        )
+    except Exception:
+        raise RuntimeError(
+            "fugashi is installed, but it might be missing the dictionary (e.g., unidic-lite).\n"
+            "Try installing via: pip install 'fugashi[unidic-lite]'\n"
+        )
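As a quick usage reference (not part of the PR; the sample sentence is invented for illustration), this is roughly how fugashi segments text once a dictionary such as unidic-lite is available:

```python
import fugashi  # pip install 'fugashi[unidic-lite]'

tagger = fugashi.Tagger()
# MeCab-based word segmentation: split a Japanese sentence into surface
# forms, the pre-tokenization step that "mecab"-type WPM tokenizers apply
# before WordPiece splitting.
words = [word.surface for word in tagger("日本語の文を分かち書きする")]
print(words)
```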


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as downloaded_file:
-        downloaded_file.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
+    try:
+        response = sess.get(url, headers=headers)
+        response.raise_for_status()
+
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        with open(save_path, 'wb') as downloaded_file:
+            downloaded_file.write(response.content)
+        logger.info(f"File {save_path} downloaded successfully")
+    except requests.HTTPError as e:
+        if e.response.status_code == 404:
+            logger.warning(f"URL not found: {url}")
+        else:
+            logger.error(f"HTTP error occurred when downloading {url}: {e}")
+    except requests.ConnectionError:
+        logger.error(f"Connection error occurred when downloading {url}")
+    except Exception as e:
+        logger.error(f"Unexpected error occurred when downloading {url}: {e}")
Collaborator:
1. This whole chain of multiple excepts can be just one single `except Exception as e`. No need to over-engineer the error handling if you are only interested in logging it.
2. The old code doesn't have this handling, so it simply terminates the script when there is an error. With this change, errors are silently ignored instead. I think this is not the expected behavior.
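For illustration, a minimal sketch of what that suggestion could look like (assuming the script's existing `sess` and `logger` globals; this is not the committed implementation). Logging once and re-raising keeps the old terminate-on-error behavior:

```python
def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    try:
        response = sess.get(url, headers=headers)
        response.raise_for_status()
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as downloaded_file:
            downloaded_file.write(response.content)
        logger.info(f"File {save_path} downloaded successfully")
    except Exception as e:
        # Log for context, then re-raise so the script still stops on failure.
        logger.error(f"Failed to download {url}: {e}")
        raise
```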

Contributor Author:
Actually, I don't have access to many of the models in the list, so the script terminates every time I run it (unless I comment out the other models). The instructions at the beginning of the file just say "Add a new model to the models list", which may confuse users.

What is your suggestion about this?

Collaborator:
I usually just temporarily comment out all the other models, then run the script. But yes, having the ability to update only the newly added model would be a better approach. I will add it in another PR.
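As a rough illustration of that idea (purely hypothetical; the argument handling below is invented and is not the actual #13847 change), the update loop could skip every model not named on the command line:

```python
import sys

# Hypothetical: extra arguments after the HF token select which models
# to process; everything else in the list is skipped.
only = set(sys.argv[2:])  # e.g. convert_hf_to_gguf_update.py <hf_token> ruri-large

for model in models:
    if only and model["name"] not in only:
        continue
    download_model(model)
```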

Collaborator:
For now, let's simply remove this change from this PR.

Contributor Author:
I removed this change.

> I will add it in another PR

Thank you in advance!



def download_model(model):
@@ -137,7 +168,7 @@ def download_model(model):

os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]

if name == "gpt-4o":
# Xenova/gpt-4o is tokenizer-only, it does not contain config.json
@@ -194,6 +225,15 @@ def download_model(model):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue

pre_tokenizer_log = True
if os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
with open(f"models/tokenizers/{name}/tokenizer_config.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
if "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab":
# Mecab need to be installed via fugashi
fugashi_check()
pre_tokenizer_log = False

    # create the tokenizer
    try:
        if name == "t5":
@@ -214,14 +254,15 @@
logger.info(f"chkhsh: {chkhsh}")

# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
if pre_tokenizer_log:
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

logger.info("")
