From 2a016d9c21c296131323fcbf908761f2c795d10d Mon Sep 17 00:00:00 2001 From: Riccardo Orlando Date: Thu, 3 Oct 2024 12:03:53 +0200 Subject: [PATCH 1/2] Support for Minerva 7B --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + include/llama.h | 1 + src/llama-vocab.cpp | 1 + src/llama.cpp | 3 +++ 5 files changed, 9 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index da5feb25b1961..ee125eb2021a3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -654,6 +654,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 022354a3b624e..30d7c614774a1 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -101,6 +101,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, + {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, ] diff --git a/include/llama.h b/include/llama.h index 7cae1bbe2e5b8..6000a1e4e21e5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -103,6 +103,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d2f34ddd6b339..bbe12c2a96236 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: case LLAMA_VOCAB_PRE_TYPE_EXAONE: + case LLAMA_VOCAB_PRE_TYPE_MINERVA: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/src/llama.cpp b/src/llama.cpp index 69ba6539599ef..04ff4d568827f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6444,6 +6444,9 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; vocab.tokenizer_add_bos = true; vocab.tokenizer_clean_spaces = false; + } else if ( + tokenizer_pre == "minerva-7b") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From d9698c39d375d9244e0020b535f2dc58e422d4a0 Mon Sep 17 00:00:00 2001 From: Riccardo Orlando Date: Thu, 5 Dec 2024 13:13:56 +0100 Subject: [PATCH 2/2] Update convert_hf_to_gguf_update.py --- convert_hf_to_gguf_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 356473dd1ab67..1832c36bff35c 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -102,7 +102,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, - {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, + {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, ]