diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ec3b5697d8f6f..7b557702b1806 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -809,6 +809,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "9286a8bdaef9f09da63eae001d8dca3e8b4dcfebfe468807c0c87a831a4a1901":
+            # ref: https://huggingface.co/cl-nagoya/ruri-large
+            res = "ruri-large"

         if res is None:
             logger.warning("\n")
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 2f733f0973686..f43e104f5a089 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "ruri-large", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/cl-nagoya/ruri-large", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -140,6 +141,24 @@ class TOKENIZER_TYPE(IntEnum):
 ]


+def fugashi_check():
+    """
+    Check whether fugashi and a Japanese dictionary are installed and can be imported.
+    """
+    try:
+        import fugashi  # pyright: ignore[reportMissingImports]
+        fugashi.Tagger()
+    except ImportError:
+        raise ImportError(
+            "fugashi is missing, install it via: pip install 'fugashi[unidic-lite]'"
+        )
+    except Exception:
+        raise RuntimeError(
+            "fugashi is installed, but it might be missing the dictionary (e.g., unidic-lite).\n"
+            "Try installing via: pip install 'fugashi[unidic-lite]'\n"
+        )
+
+
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
     response = sess.get(url, headers=headers)
@@ -163,6 +182,9 @@ def download_model(model):
         # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
         files = ["tokenizer.json", "tokenizer_config.json"]

+    if name == "ruri-large":
+        files = ["config.json", "tokenizer_config.json", "vocab.txt"]
+
     if tokt == TOKENIZER_TYPE.SPM:
         files.append("tokenizer.model")

@@ -235,6 +257,15 @@ def get_existing_models(convert_py):
         logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
         continue

+    pre_tokenizer_log = True
+    if os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+        with open(f"models/tokenizers/{name}/tokenizer_config.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            if "word_tokenizer_type" in cfg and cfg["word_tokenizer_type"] == "mecab":
+                # MeCab needs to be installed via fugashi
+                fugashi_check()
+                pre_tokenizer_log = False
+
     # create the tokenizer
     if chkhsh is not None:
         # if the model has a pre-computed hash, use it
@@ -263,7 +294,8 @@ def get_existing_models(convert_py):
     logger.info(f"chktok: {chktok}")
     logger.info(f"chkhsh: {chkhsh}")

-    # print the "pre_tokenizer" content from the tokenizer.json
+    # print the "pre_tokenizer" content from the tokenizer.json
+    if pre_tokenizer_log:
         with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
             cfg = json.load(f)
             normalizer = cfg["normalizer"]
diff --git a/models/ggml-vocab-ruri-large.gguf b/models/ggml-vocab-ruri-large.gguf
new file mode 100644
index 0000000000000..9445c76b1b974
Binary files /dev/null and b/models/ggml-vocab-ruri-large.gguf differ
diff --git a/models/ggml-vocab-ruri-large.gguf.inp b/models/ggml-vocab-ruri-large.gguf.inp
new file mode 100644
index 0000000000000..86b934e4020fb
--- /dev/null
+++ b/models/ggml-vocab-ruri-large.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Äpfel
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-ruri-large.gguf.out b/models/ggml-vocab-ruri-large.gguf.out
new file mode 100644
index 0000000000000..5ff26a45e72e1
--- /dev/null
+++ b/models/ggml-vocab-ruri-large.gguf.out
@@ -0,0 +1,46 @@
+ 88 13247 35 32 1 33 92 18336 7095 7045
+ 1
+
+
+
+
+
+
+
+
+
+ 32004 29944 102 28789
+ 32004 29944 102 28789
+ 32004 29944 18520
+ 32004 29944 18520
+ 32004 29944 18520 16
+ 32004 29944 27 102 28789 16
+ 32004 29944 27 102 28789 16
+ 14152 12741 23274 1 29 82 16003
+ 102 16435 7187 38 99 7069 25460 7099 83 7045 7094 7222 7095 7069
+ 1 1 1
+ 1
+ 1 23 31304 7048 21907 7071 24 1 1 1 23 92 19760 14698 12835 84 7073 7075 32061 7045 26430 30214 16061 23624 16061 7094 24 1 23 18446 16157 84 7073 7075 32061 14152 12648 22106 7045 21801 7045 94 7070 7044 17253 20903 7044 24
+ 32004 29944
+ 32004 29944
+ 32004 29944
+ 32004 29944
+ 32004 29944
+ 32004 29944 32004 29944
+ 23
+ 44
+ 22 84 14469
+ 32004 29944 27 104 22 28187 16 55 13544 21369 7084 23418 1 46 2366 2263 1448 80 16003 12835 17228 22230 17880 23055 1589 109
+ 16 16 16 16 16 16
+ 34
+ 13590
+ 13590 7083
+ 13590 17209
+ 13590 17209 7083
+ 13590 17209 17209
+ 13590 17209 17209 7083
+ 13590 17209 17209 17209
+ 13590 17209 17209 17209 7083
+ 1 1
+ 23283 23637 14194 7045
+ 1 23 31304 7048 21907 7071 24 1 1 1 23 92 19760 14698 12835 84 7073 7075 32061 7045 26430 30214 16061 23624 16061 7094 24 1 1 34 13590 13590 7083 13590 17209 13590 17209 7083 13590 17209 17209 13590 17209 17209 7083 13590 17209 17209 17209 34 29 34 34 29 29 34 34 29 29 29 34 1 46 2366 2263 1448 80 16003 12835 17228 22230 17880 23055 1589 109 26810 7509 7509 7509 7509 8741 8741 8741 8741 8741 8741 8741 1 1 1 22 22 22 22 22 22 79 79 79 79 79 8669 8669 7329 7329 7329 7329 28042 28042 28042 7508 7508 7508 7508 7508 7508 8134 8134 8134 8134 8134 8134 56 22 101 7084 24992 12620 22 17253 16344 30334 22 98 13891 12940 27 22 18898 23418 31114 12940 46 22 60 31304 7046 31114 12940 56 22 91 7071 92 19897 21801 27 22 51 23418 91 26183 98 19851 30713 7043 46 21452 22 69 7084 80 22 91 7159
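Reviewer note: the chkhsh registered above in convert_hf_to_gguf.py is the sha256 of the stringified token-id list produced by encoding the shared chktxt test string (see get_vocab_base_pre() and convert_hf_to_gguf_update.py). A minimal sketch for reproducing it locally, assuming transformers and fugashi[unidic-lite] are installed; compute_chkhsh is a hypothetical helper name, and chktxt must be copied verbatim from convert_hf_to_gguf_update.py:

    # Sketch only: mirrors how convert_hf_to_gguf.py derives the pre-tokenizer hash.
    from hashlib import sha256
    from transformers import AutoTokenizer  # ruri-large additionally needs fugashi[unidic-lite]

    def compute_chkhsh(repo: str, chktxt: str) -> str:
        # encode the shared test string and hash the str() of the id list,
        # as get_vocab_base_pre() does
        tokenizer = AutoTokenizer.from_pretrained(repo)
        chktok = tokenizer.encode(chktxt)
        return sha256(str(chktok).encode()).hexdigest()

    # with the real chktxt, this should print
    # 9286a8bdaef9f09da63eae001d8dca3e8b4dcfebfe468807c0c87a831a4a1901
    # print(compute_chkhsh("cl-nagoya/ruri-large", chktxt))

The new models/ggml-vocab-ruri-large.gguf.inp/.out pair follows the existing vocab-test convention, so the tokenization can be exercised with the usual test-tokenizer binaries against models/ggml-vocab-ruri-large.gguf.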