-
Notifications
You must be signed in to change notification settings - Fork 13.7k
convert: handle when model's tokenization method relies on Mecab #13830
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
036b5d6
c484802
4e0f769
547b380
0192cab
94184ae
f256169
a6b9bde
a7fef9c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,8 @@ | |
| # | ||
| # TODO: generate tokenizer tests for llama.cpp | ||
| # | ||
|
|
||
| import subprocess | ||
| import importlib.util | ||
| import logging | ||
| import os | ||
| import pathlib | ||
|
|
@@ -117,17 +118,47 @@ class TOKENIZER_TYPE(IntEnum): | |
| {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", }, | ||
| {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, | ||
| {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, | ||
| {"name": "ruri-large", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/cl-nagoya/ruri-large", }, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you add it here, you must also run the script so it updates
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and btw, do we even have the CPP code to handle this? is this already tested?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested that model and similar models (ruri-*) locally for embedding task and it worked.
I'm sorry. About this, like I said before, I don't have access to many models in the list, so it's hard to run all listed models to update to
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When #13847 is merged, you can run the script again and this time it will only process the newly added model |
||
| ] | ||
|
|
||
|
|
||
def fugashi_check():
    """
    Check that fugashi and a Japanese dictionary are installed and usable.

    MeCab-based tokenizers (word_tokenizer_type == "mecab") need fugashi
    plus a dictionary package such as unidic-lite to instantiate a Tagger.

    Raises:
        ImportError: if the fugashi package itself cannot be imported.
        RuntimeError: if fugashi imports but Tagger construction fails,
            which usually means the dictionary (e.g. unidic-lite) is missing.
    """
    # Keep the import in its own try block: an ImportError raised later,
    # while loading the dictionary inside Tagger(), must not be mistaken
    # for fugashi itself being absent.
    try:
        import fugashi
    except ImportError as e:
        raise ImportError(
            "fugashi is missing, install it via: pip install 'fugashi[unidic-lite]'"
        ) from e

    try:
        # Constructing a Tagger forces dictionary loading, which is the
        # part that fails when only the fugashi wheel is installed.
        fugashi.Tagger()
    except Exception as e:
        raise RuntimeError(
            "fugashi is installed, but it might be missing the dictionary (e.g., unidic-lite).\n"
            "Try installing via: pip install 'fugashi[unidic-lite]'\n"
        ) from e
|
|
||
|
|
||
def download_file_with_auth(url, token, save_path):
    """
    Download *url* with a bearer-token header and write it to *save_path*.

    Best-effort: every failure is logged and swallowed rather than raised,
    so one missing optional file (e.g. a model repo without vocab.txt)
    does not abort the whole update run. Returns None either way.
    """
    headers = {"Authorization": f"Bearer {token}"}
    try:
        # NOTE(review): `sess` and `logger` are module-level objects defined
        # elsewhere in this file; `sess` is presumably a requests.Session.
        response = sess.get(url, headers=headers)
        response.raise_for_status()

        # Create parent directories on demand before writing the payload.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as downloaded_file:
            downloaded_file.write(response.content)
        logger.info(f"File {save_path} downloaded successfully")
    except requests.HTTPError as e:
        # 404 is expected for optional files, so it is only a warning;
        # any other HTTP status is a real error.
        if e.response.status_code == 404:
            logger.warning(f"URL not found: {url}")
        else:
            logger.error(f"HTTP error occurred when downloading {url}: {e}")
    except requests.ConnectionError:
        logger.error(f"Connection error occurred when downloading {url}")
    except Exception as e:
        # Catch-all keeps the download loop going; the cause is logged.
        logger.error(f"Unexpected error occurred when downloading {url}: {e}")
|
||
|
|
||
|
|
||
| def download_model(model): | ||
|
|
@@ -137,7 +168,7 @@ def download_model(model): | |
|
|
||
| os.makedirs(f"models/tokenizers/{name}", exist_ok=True) | ||
|
|
||
| files = ["config.json", "tokenizer.json", "tokenizer_config.json"] | ||
| files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"] | ||
|
|
||
| if name == "gpt-4o": | ||
| # Xenova/gpt-4o is tokenizer-only, it does not contain config.json | ||
|
|
@@ -194,6 +225,15 @@ def download_model(model): | |
| logger.warning(f"Directory for tokenizer {name} not found. Skipping...") | ||
| continue | ||
|
|
||
| pre_tokenizer_log = True | ||
| if os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"): | ||
| with open(f"models/tokenizers/{name}/tokenizer_config.json", "r", encoding="utf-8") as f: | ||
| cfg = json.load(f) | ||
| if "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab": | ||
| # MeCab needs to be installed via fugashi | ||
| fugashi_check() | ||
| pre_tokenizer_log = False | ||
|
|
||
| # create the tokenizer | ||
| try: | ||
| if name == "t5": | ||
|
|
@@ -214,14 +254,15 @@ def download_model(model): | |
| logger.info(f"chkhsh: {chkhsh}") | ||
|
|
||
| # print the "pre_tokenizer" content from the tokenizer.json | ||
| with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: | ||
| cfg = json.load(f) | ||
| normalizer = cfg["normalizer"] | ||
| logger.info("normalizer: " + json.dumps(normalizer, indent=4)) | ||
| pre_tokenizer = cfg["pre_tokenizer"] | ||
| logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) | ||
| if "ignore_merges" in cfg["model"]: | ||
| logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)) | ||
| if pre_tokenizer_log: | ||
| with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: | ||
| cfg = json.load(f) | ||
| normalizer = cfg["normalizer"] | ||
| logger.info("normalizer: " + json.dumps(normalizer, indent=4)) | ||
| pre_tokenizer = cfg["pre_tokenizer"] | ||
| logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) | ||
| if "ignore_merges" in cfg["model"]: | ||
| logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)) | ||
|
|
||
| logger.info("") | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this can be removed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I removed it