Commit 433782b

Fix to use hidden_size_per_head
1 parent 62c3b64

2 files changed: 6 additions & 4 deletions


convert_hf_to_gguf.py

Lines changed: 2 additions & 1 deletion

@@ -4204,8 +4204,9 @@ def set_gguf_parameters(self):
 
         self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
         self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
+        self.gguf_writer.add_features_length(hparams.get("hidden_size_per_head", 128))
         self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+        self.gguf_writer.add_wkv_head_size(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
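For context, a minimal Python sketch of the conversion-side change, assuming a Hugging Face style config dict: the per-head size is read directly from hidden_size_per_head when present, with the same fallback of 128 used above, and the attention head count is now recorded via add_wkv_head_size rather than add_head_count. The config values below are hypothetical, not taken from a real checkpoint.

# Minimal sketch, assuming a Hugging Face style config dict.
# The values are hypothetical, not from a real checkpoint.

hparams = {
    "hidden_size": 4096,
    "num_attention_heads": 32,
    "hidden_size_per_head": 128,
}

# Read the per-head size explicitly, falling back to 128 as in the diff above.
features_length = hparams.get("hidden_size_per_head", 128)

# The attention head count still comes from the config, but it is now
# written through add_wkv_head_size instead of the generic head count.
wkv_head_size = hparams.get("num_attention_heads", 32)

print(features_length, wkv_head_size)   # 128 32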

src/llama-model.cpp

Lines changed: 4 additions & 3 deletions

@@ -3371,11 +3371,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const uint32_t d_state = hparams.ssm_d_state;
                 const uint32_t num_heads = hparams.ssm_dt_rank;
                 const uint32_t intermediate_size = hparams.ssm_d_inner;
-                const uint32_t head_dim = intermediate_size / num_heads;
+                const uint32_t head_dim = hparams.wkv_head_size;
                 const uint32_t qk_dim = head_dim;
                 const uint32_t v_dim = head_dim;
-                const int64_t num_attention_heads = hparams.n_head();
-                const int64_t q_num_heads = num_attention_heads;
                 const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
 
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3392,6 +3390,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     auto & layer = layers[i];
                     bool is_mamba_layer = hparams.is_recurrent(i);
 
+                    const int64_t num_attention_heads = hparams.n_head_kv_arr[i];
+                    const int64_t q_num_heads = num_attention_heads;
+
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
                     if (is_mamba_layer) {
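On the loader side, the same idea in a short Python sketch, under the assumption of a hybrid model where only some layers carry attention: the head dimension now comes from the stored wkv_head_size metadata instead of being derived as intermediate_size / num_heads, and the attention head count is resolved per layer inside the layer loop. The metadata values and the layer layout below are hypothetical.

# Minimal sketch of the loader-side idea in plain Python. The metadata and
# layer layout are hypothetical, not read from a real GGUF file.

metadata = {
    "wkv_head_size": 128,                 # written by the converter above
    "ssm_d_inner": 8192,                  # hypothetical SSM inner size
    "ssm_dt_rank": 32,                    # hypothetical number of SSM heads
    "n_head_per_layer": [0, 32, 0, 32],   # 0 marks recurrent (Mamba) layers
}

# Old derivation: inner size divided by the SSM head count.
head_dim_old = metadata["ssm_d_inner"] // metadata["ssm_dt_rank"]
# 256 with these hypothetical values -- not necessarily the true per-head size.

# New: take the head dimension straight from the stored metadata.
head_dim = metadata["wkv_head_size"]
qk_dim = head_dim
v_dim = head_dim

# New: resolve the attention head count per layer, inside the layer loop,
# instead of once for the whole model.
for i, n_head in enumerate(metadata["n_head_per_layer"]):
    if n_head == 0:
        continue  # recurrent layer: no attention heads to size
    q_num_heads = n_head
    print(f"layer {i}: {q_num_heads} heads x {head_dim} dims per head")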
