@@ -1478,7 +1478,7 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * llm_graph_context::build_attn_mla(
         llm_graph_input_attn_kv_unified * inp,
         ggml_cgraph * gf,
-        ggml_tensor * wv_decompress,
+        ggml_tensor * wv_b,
         ggml_tensor * wo,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
@@ -1497,8 +1497,8 @@ ggml_tensor * llm_graph_context::build_attn_mla(
     const auto kv_lora_rank = hparams.n_lora_kv;
 
     // note: deepseek with MLA option converts into MQA with larger n_embd (i.e. GQA with 1 group)
-    const int64_t n_embd_k_compressed = kv_lora_rank + hparams.n_rot;
-    const int64_t n_embd_v_compressed = kv_lora_rank;
+    const int64_t n_embd_k_cmpr = kv_lora_rank + hparams.n_rot;
+    const int64_t n_embd_v_cmpr = kv_lora_rank;
 
     // note: this is the smaller n_embd that we get after decompression
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
@@ -1514,17 +1514,17 @@ ggml_tensor * llm_graph_context::build_attn_mla(
     GGML_ASSERT(kv_self->size == n_ctx);
 
     ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il],
-            n_tokens*n_embd_k_compressed,
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_compressed)*kv_head);
+            n_tokens*n_embd_k_cmpr,
+            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_cmpr)*kv_head);
     // cb(k_cache_view, "k_cache_view", il);
 
     // note: storing RoPE-ed version of K in the KV cache
     ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
 
-    v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_compressed, n_tokens);
+    v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_cmpr, n_tokens);
 
     ggml_tensor * v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il],
-            n_tokens, n_embd_v_compressed,
+            n_tokens, n_embd_v_cmpr,
             (  n_ctx)*ggml_element_size(kv_self->v_l[il]),
             (kv_head)*ggml_element_size(kv_self->v_l[il]));
 
@@ -1543,34 +1543,34 @@ ggml_tensor * llm_graph_context::build_attn_mla(
 
     const auto n_kv = kv_self->n;
 
-    ggml_tensor * k_compressed = ggml_view_2d(ctx0, kv_self->k_l[il],
-            n_embd_k_compressed, n_kv,
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_compressed),
+    ggml_tensor * k_cmpr = ggml_view_2d(ctx0, kv_self->k_l[il],
+            n_embd_k_cmpr, n_kv,
+            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_cmpr),
             0);
-    cb(k_compressed, "k_compressed", il);
+    cb(k_cmpr, "k_cmpr", il);
 
-    struct ggml_tensor * v_compressed_trans = ggml_view_2d(ctx0, kv_self->v_l[il],
-            n_kv, n_embd_v_compressed,
+    struct ggml_tensor * v_cmpr_trans = ggml_view_2d(ctx0, kv_self->v_l[il],
+            n_kv, n_embd_v_cmpr,
             ggml_element_size(kv_self->v_l[il])*n_ctx,
             0);
-    cb(v_compressed_trans, "v_compressed_trans", il);
+    cb(v_cmpr_trans, "v_cmpr_trans", il);
 
-    ggml_tensor * q_compressed = ggml_view_2d(ctx0, q_cur,
-            n_embd_k_compressed, n_tokens*n_head,
-            ggml_row_size(q_cur->type, n_embd_k_compressed),
+    ggml_tensor * q_cmpr = ggml_view_2d(ctx0, q_cur,
+            n_embd_k_cmpr, n_tokens*n_head,
+            ggml_row_size(q_cur->type, n_embd_k_cmpr),
             0);
-    cb(q_compressed, "q_compressed", il);
+    cb(q_cmpr, "q_cmpr", il);
 
-    ggml_tensor * kq = ggml_mul_mat(ctx0, k_compressed, q_compressed);
-    cb(kq, "kq", il);
+    ggml_tensor * kq_cmpr = ggml_mul_mat(ctx0, k_cmpr, q_cmpr);
+    cb(kq_cmpr, "kq_cmpr", il);
 
-    kq = ggml_view_3d(ctx0, kq, n_kv, n_tokens, n_head,
-            ggml_row_size(kq->type, n_kv),
-            ggml_row_size(kq->type, n_kv)*n_tokens,
+    kq_cmpr = ggml_view_3d(ctx0, kq_cmpr, n_kv, n_tokens, n_head,
+            ggml_row_size(kq_cmpr->type, n_kv),
+            ggml_row_size(kq_cmpr->type, n_kv)*n_tokens,
             0);
-    cb(kq, "kq_view", il);
+    cb(kq_cmpr, "kq_view", il);
 
-    ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+    ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq_cmpr, kq_mask, kq_scale, hparams.f_max_alibi_bias);
     cb(kq_soft_max, "kq_soft_max", il);
 
     kq_soft_max = ggml_view_2d(ctx0, kq_soft_max,
@@ -1579,24 +1579,24 @@ ggml_tensor * llm_graph_context::build_attn_mla(
             0);
     cb(kq_soft_max, "kq_soft_max_view", il);
 
-    ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, v_compressed_trans, kq_soft_max);
-    cb(kqv_compressed, "kqv_compressed,", il);
+    ggml_tensor * kqv_cmpr = ggml_mul_mat(ctx0, v_cmpr_trans, kq_soft_max);
+    cb(kqv_cmpr, "kqv_cmpr", il);
 
-    kqv_compressed = ggml_view_3d(ctx0, kqv_compressed,
-            n_embd_v_compressed, n_tokens, n_head,
-            ggml_row_size(kqv_compressed->type, n_embd_v_compressed),
-            ggml_row_size(kqv_compressed->type, n_embd_v_compressed)*n_tokens,
+    kqv_cmpr = ggml_view_3d(ctx0, kqv_cmpr,
+            n_embd_v_cmpr, n_tokens, n_head,
+            ggml_row_size(kqv_cmpr->type, n_embd_v_cmpr),
+            ggml_row_size(kqv_cmpr->type, n_embd_v_cmpr)*n_tokens,
             0);
-    cb(kqv_compressed, "kqv_compressed_view", il);
+    cb(kqv_cmpr, "kqv_cmpr_view", il);
 
-    ggml_tensor * wv_decompress_view = ggml_view_3d(ctx0, wv_decompress,
-            n_embd_v_compressed, n_embd_head_v, n_head,
-            ggml_row_size(wv_decompress->type, n_embd_v_compressed),
-            ggml_row_size(wv_decompress->type, n_embd_v_compressed)*n_embd_head_v,
+    ggml_tensor * wv_b_view = ggml_view_3d(ctx0, wv_b,
+            n_embd_v_cmpr, n_embd_head_v, n_head,
+            ggml_row_size(wv_b->type, n_embd_v_cmpr),
+            ggml_row_size(wv_b->type, n_embd_v_cmpr)*n_embd_head_v,
             0);
-    cb(wv_decompress_view, "wv_decompress_view", il);
+    cb(wv_b_view, "wv_b_view", il);
 
-    ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_decompress_view, kqv_compressed);
+    ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_view, kqv_cmpr);
     cb(kqv, "kqv", il);
 
     kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);