
Commit 6b3f775

concatenate split tensors and add more metadata
1 parent 92266e9 commit 6b3f775

7 files changed, +55 -17 lines changed

convert_hf_to_gguf.py

Lines changed: 37 additions & 10 deletions
@@ -2717,27 +2717,56 @@ def set_gguf_parameters(self):
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

+        # Treat "original" as "yarn", seems to have been a mistake
+        if self.hparams.get("rope_type") in ("yarn", "original"):
+            # config.json values differ from standard, we may have to add metadata for these:
+            # extrapolation_factor = 1.0
+            # attn_factor = 1.0
+            # beta_fast = 8
+            # beta_slow = 1
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
+
+        if temp_len := self.hparams.get("attn_temperature_len"):
+            self.gguf_writer.add_attn_temperature_length(temp_len)
+
         self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
         self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
         self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])

-    _experts: list[dict[str, Tensor]] | None = None
+    _experts: list[dict[str, list[Tensor]]] | None = None
+    _cur_expert = ""

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors: list[tuple[str, Tensor]] = []
+        is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
+
+        if not is_expert:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
         # process the experts separately
-        if name.find(".moe.") != -1 or name.find(".block_sparse_moe.experts.") != -1:
+        if is_expert or self._cur_expert:
             n_experts = self.hparams["num_local_experts"]

             assert bid is not None

             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]

-            self._experts[bid][name] = data_torch
+            # concatenate split tensors
+            if name in self._experts[bid]:
+                self._cur_expert = name
+                self._experts[bid][name].append(data_torch)
+                return []
+            elif is_expert:
+                self._cur_expert = name
+                self._experts[bid][name] = [data_torch]
+                return []
+            else:
+                self._cur_expert = ""

             if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
                 # merge the experts into a single 3d tensor
                 for wid in [("linear", "w1"), ("linear_1", "w2"), ("linear_v", "w3")]:
                     datas: list[Tensor] = []
@@ -2746,7 +2775,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                         ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
                         if ename not in self._experts[bid]:
                             ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
-                        datas.append(self._experts[bid][ename])
+                        tensor_list = self._experts[bid][ename]
+                        datas.append(torch.hstack(tensor_list) if len(tensor_list) > 1 else tensor_list[0])
                         del self._experts[bid][ename]

                     data_torch = torch.stack(datas, dim=0)
@@ -2756,11 +2786,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                     new_name = self.map_tensor_name(merged_name)

                     tensors.append((new_name, data_torch))
-            return tensors
-        else:
-            return []

-        return [(self.map_tensor_name(name), data_torch)]
+        return tensors


 @ModelBase.register("DbrxForCausalLM")
gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -147,6 +147,7 @@ class Attention:
         SLIDING_WINDOW = "{arch}.attention.sliding_window"
         SCALE = "{arch}.attention.scale"
         OUTPUT_SCALE = "{arch}.attention.output_scale"
+        TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
         KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
         SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -832,6 +832,9 @@ def add_attention_scale(self, value: float) -> None:
     def add_attn_output_scale(self, value: float) -> None:
         self.add_float32(Keys.Attention.OUTPUT_SCALE.format(arch=self.arch), value)

+    def add_attn_temperature_length(self, value: float) -> None:
+        self.add_float32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
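For reference, a minimal sketch of writing the new key through gguf-py (not part of the commit; the output file name and value are made up, and "grok" is assumed as the architecture string, so the key expands to "grok.attention.temperature_length"):

from gguf import GGUFWriter

# hypothetical standalone writer; the real converter calls this from set_gguf_parameters()
w = GGUFWriter("grok-2-test.gguf", "grok")
w.add_attn_temperature_length(8192.0)  # stored as a float32 KV entry
w.write_header_to_file()
w.write_kv_data_to_file()
w.write_tensors_to_file()
w.close()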

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
@@ -167,6 +167,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -171,6 +171,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

src/llama-hparams.h

Lines changed: 3 additions & 0 deletions
@@ -134,7 +134,10 @@ struct llama_hparams {
     float f_residual_scale = 0.0f;
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
+
+    // grok-2
     float f_attn_out_scale = 0.0f;
+    float f_attn_temp_len = 0.0f;

     bool causal_attn = true;
     bool use_alibi = false;

src/llama-model.cpp

Lines changed: 9 additions & 7 deletions
@@ -693,13 +693,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 // no final_logit_softcapping in grok-1
                 hparams.f_final_logit_softcapping = 0.0f;

-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
-                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
-                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
-                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
-                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
-                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
+                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
+
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.f_attn_temp_len, false);

                 switch (hparams.n_layer) {
                     case 64: type = LLM_TYPE_314B; break;
