From 2a016d9c21c296131323fcbf908761f2c795d10d Mon Sep 17 00:00:00 2001
From: Riccardo Orlando <orlandoricc@gmail.com>
Date: Thu, 3 Oct 2024 12:03:53 +0200
Subject: [PATCH 1/2] Support for Minerva 7B

---
 convert_hf_to_gguf.py        | 3 +++
 convert_hf_to_gguf_update.py | 1 +
 include/llama.h              | 1 +
 src/llama-vocab.cpp          | 1 +
 src/llama.cpp                | 3 +++
 5 files changed, 9 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index da5feb25b1961..ee125eb2021a3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -654,6 +654,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
 
         if res is None:
             logger.warning("\n")
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 022354a3b624e..30d7c614774a1 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -101,6 +101,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
+    {"name": "minerva-7b",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
 ]
 
 
diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe2e5b8..6000a1e4e21e5 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -103,6 +103,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
     };
 
     enum llama_rope_type {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d2f34ddd6b339..bbe12c2a96236 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
diff --git a/src/llama.cpp b/src/llama.cpp
index 69ba6539599ef..04ff4d568827f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6444,6 +6444,9 @@ static void llm_load_vocab(
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                 vocab.tokenizer_add_bos = true;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "minerva-7b") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }

From d9698c39d375d9244e0020b535f2dc58e422d4a0 Mon Sep 17 00:00:00 2001
From: Riccardo Orlando <Riccorl@users.noreply.github.com>
Date: Thu, 5 Dec 2024 13:13:56 +0100
Subject: [PATCH 2/2] Update convert_hf_to_gguf_update.py

---
 convert_hf_to_gguf_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 356473dd1ab67..1832c36bff35c 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -102,7 +102,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+    {"name": "minerva-7b",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
 ]