@@ -1235,15 +1235,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply
+        ggml_tensor * kq = nullptr;
         if (ggml_is_contiguous(k) && ggml_is_contiguous(q) && n_head_kv == 1) {
             k = ggml_reshape_2d(ctx0, k, n_embd, n_tokens);
             q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
-            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            kq = ggml_mul_mat(ctx0, k, q);
             // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
             kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);
         } else {
-            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            kq = ggml_mul_mat(ctx0, k, q);
             // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
         }
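For context, here is a minimal, self-contained sketch of the pattern this hunk introduces: `kq` is declared once before the branch, so both the MQA fast path (flatten K and Q to 2D and do a single matrix multiply) and the generic batched path assign to the same tensor, which the rest of the attention builder can then consume regardless of which branch ran. The shapes, the `n_*` values, and the `main` wrapper below are illustrative assumptions, not code taken from llama.cpp.

```c
// Sketch only: shows the hoisted `kq` declaration and the MQA 2D-matmul
// shortcut from the hunk above. Shapes and constants are made up for the
// example; link against ggml to build.
#include "ggml.h"
#include <stdint.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    const int64_t n_embd    = 64;  // per-head embedding size (illustrative)
    const int64_t n_kv      = 8;   // cached KV positions     (illustrative)
    const int64_t n_tokens  = 4;   // query tokens in the batch
    const int64_t n_head    = 2;   // query heads
    const int64_t n_head_kv = 1;   // a single KV head -> the MQA case

    // K: [n_embd, n_kv, n_head_kv], Q: [n_embd, n_tokens, n_head]
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, n_kv,     n_head_kv);
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, n_tokens, n_head);

    // declared once, assigned in both branches (the point of the change)
    struct ggml_tensor * kq = NULL;

    if (ggml_is_contiguous(k) && ggml_is_contiguous(q) && n_head_kv == 1) {
        // MQA: flatten to 2D so one plain matmul replaces a batched 3D matmul
        k  = ggml_reshape_2d(ctx0, k, n_embd, n_kv);
        q  = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
        kq = ggml_mul_mat(ctx0, k, q);                      // [n_kv, n_tokens*n_head]
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);           // attention scores need F32 range
        kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);
    } else {
        // generic path: batched matmul, query heads broadcast over KV heads
        kq = ggml_mul_mat(ctx0, k, q);
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    }

    // `kq` is valid here in either case; the real builder would continue with
    // mask + softmax + the V matmul, which is outside this sketch.
    (void) kq;

    ggml_free(ctx0);
    return 0;
}
```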