From 77612580064bddb8de0e9a2b7983fd20184f1024 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Tue, 18 Mar 2025 19:54:34 +0100
Subject: [PATCH 1/5] Add support for GPT2, Bloom and CodeShell tied word
 embeddings

---
 src/llama-model.cpp | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9171585bd9d91..ae583e53ebdf3 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_CODESHELL:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if tok embd is NULL, init from output
+                    if (tok_embd == NULL) {
+                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

From faea5ffd479ea05073796031aa76dee55f300ddc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Tue, 18 Mar 2025 19:56:13 +0100
Subject: [PATCH 2/5] Deduplicate tied word embeddings weights

---
 convert_hf_to_gguf.py | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d13d57c54154a..c2b56402de92e 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1099,13 +1099,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         tensors.append((self.map_tensor_name(name), data_torch))
 
-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors
 
 
@@ -2404,10 +2397,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         tensors.append((new_name, data_torch))
 
-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors
 
 
@@ -2737,22 +2726,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        new_name = self.map_tensor_name(name)
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
-
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
-
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
-        return tensors
-
 
 @Model.register("InternLM2ForCausalLM")
 class InternLM2Model(Model):
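Patches 1 and 2 form a pair: patch 1 teaches llama_model::load_tensors() to treat the output tensor as optional (TENSOR_NOT_REQUIRED) and to fall back to a duplicated view of token_embd.weight (TENSOR_DUPLICATED) when it is absent, which lets patch 2 delete the converter code that wrote the embedding matrix into the GGUF file twice. The deleted Bloom/CodeShell check reduces to one rule, shown in the following minimal standalone sketch; the helper name is hypothetical and not part of convert_hf_to_gguf.py:

    # Sketch of the tie-detection rule removed in patch 2: a checkpoint ties
    # its embeddings when it serializes no separate output head. After
    # patch 1 this fallback lives in the C++ loader, not the converter.
    def embeddings_are_tied(tensor_names: set[str]) -> bool:
        return all(s not in tensor_names for s in ("lm_head.weight", "output.weight"))

    # GPT-2 style checkpoint storing only the input embeddings -> tied
    assert embeddings_are_tied({"transformer.wte.weight", "transformer.h.0.attn.c_attn.weight"})
    # A separate lm_head is present -> not tied
    assert not embeddings_are_tied({"transformer.wte.weight", "lm_head.weight"})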
From dc36338e6cdbbb4741ceff16e4907f1401f2e1de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Tue, 18 Mar 2025 23:44:31 +0100
Subject: [PATCH 3/5] Workaround for incorrect weight map

It appears transformer.wte.weight is in the weight map even though the
weights are not there; remove it if output weights are encountered
first.
---
 convert_hf_to_gguf.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c2b56402de92e..e65e9982558cb 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -180,7 +180,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
                 missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
                 if len(extra) == 0 and len(missing_files) > 0:
-                    raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                    raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                     f"Missing tensors: {missing}")
                 else:
                     raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                      f"Missing tensors: {missing}\n"
@@ -2726,6 +2727,27 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
+    _has_tok_embd = False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
+
+        new_name = self.map_tensor_name(name)
+
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True
+
+        return [(new_name, data_torch)]
+
 
 @Model.register("InternLM2ForCausalLM")
 class InternLM2Model(Model):

From 0cc8cb57da8fef6d2cf1e8a135d4166921241742 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Tue, 18 Mar 2025 23:51:04 +0100
Subject: [PATCH 4/5] check++

---
 convert_hf_to_gguf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e65e9982558cb..b8e6d8b19d0bb 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -659,8 +659,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
         if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
-            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
-            res = "codeshell"
+            # ref: https://huggingface.co/WisdomShell/-7B
+            res = ""
         if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
             # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
             res = "tekken"
@@ -2740,7 +2740,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         # assuming token_embd.weight is seen before output.weight
         if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
             # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
-            if "transformer.wte.weight" in self.tensor_names:
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
                 logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
                 self.tensor_names.remove("transformer.wte.weight")
         elif new_name == tok_embd_name:
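Patches 3 and 4 handle checkpoints whose index file (weight map) lists transformer.wte.weight even though no tensor shard actually contains it: once output.weight streams past without the token embedding having been seen, the embeddings are assumed tied and the stale entry is dropped so get_tensors() does not report a missing tensor. A condensed sketch of that ordering guard follows; the class name is hypothetical and literal tensor names stand in for the format_tensor_name() results used in the real patch:

    # Condensed sketch of the state machine added in patch 3 and hardened in
    # patch 4 (TiedEmbeddingTracker is an illustrative name only).
    class TiedEmbeddingTracker:
        def __init__(self, tensor_names: set[str] | None):
            self.tensor_names = tensor_names   # names promised by the weight map
            self._has_tok_embd = False

        def observe(self, new_name: str) -> None:
            # assuming token_embd.weight is seen before output.weight
            if not self._has_tok_embd and new_name == "output.weight":
                # patch 4's extra check: tensor_names may be None (no weight map)
                if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
                    self.tensor_names.remove("transformer.wte.weight")
            elif new_name == "token_embd.weight":
                self._has_tok_embd = True

    t = TiedEmbeddingTracker({"transformer.wte.weight", "transformer.h.0.mlp.c_fc.weight"})
    t.observe("output.weight")   # wte never seen first: treat as tied, drop stale entry
    assert "transformer.wte.weight" not in t.tensor_names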
From 1b2a53a8a16c1e2d96b3fd6a816d0ccde5a44f4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Wed, 19 Mar 2025 00:06:56 +0100
Subject: [PATCH 5/5] fatfingers--

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b8e6d8b19d0bb..586a1a85d2830 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -659,8 +659,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
         if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
-            # ref: https://huggingface.co/WisdomShell/-7B
-            res = ""
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
         if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
             # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
             res = "tekken"
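Patch 5 reverts the accidental edit that patch 4 made to the codeshell tokenizer hash entry, leaving only the intended tensor_names guard. For reference, the net effect of the series on the load path can be paraphrased as follows, sketched in Python rather than the C++ of patch 1, with dictionary lookups standing in for create_tensor() and its TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED flags:

    # Loader-side fallback, paraphrased: request output.weight as optional
    # and, when the GGUF file omits it, reuse token_embd.weight as the
    # output head instead of requiring a second copy of the matrix.
    def resolve_output(tensors: dict):
        output = tensors.get("output.weight")      # TENSOR_NOT_REQUIRED
        if output is None:
            output = tensors["token_embd.weight"]  # TENSOR_DUPLICATED (shared data)
        return output

    # A tied-embedding model converted with patch 2 stores only token_embd.weight:
    print(resolve_output({"token_embd.weight": "wte-matrix"}))  # -> "wte-matrix"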