From 66c374c79aeecb9fd9a2b5d3e7331edf14627597 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 12:29:32 +0100
Subject: [PATCH 1/8] permute `Qcur` instead of `q_nope_absorbed`

---
 src/llama-model.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 248c61748eaa8..2a9d4c2f695bc 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -10151,15 +10151,15 @@ struct llm_build_deepseek2 : public llm_graph_context {
             ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
             cb(q_nope_absorbed, "q_nope_absorbed", il);

-            // {kv_lora_rank, n_head, n_tokens}
-            q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
-            cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
-
-            // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+            // {n_embd_head_qk_rope + kv_lora_rank, n_tokens, n_head}
             // note: rope must go first for in-place context shifting in build_rope_shift()
             ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
             cb(Qcur, "Qcur", il);

+            // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+            Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            cb(Qcur, "Qcur_perm", il);
+
             kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
             cb(kv_cmpr, "kv_cmpr_reshape", il);

From 0525166a80eed0bd841722867b155d11911ae2c0 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 12:37:29 +0100
Subject: [PATCH 2/8] Also need to permute `q_pe`

---
 src/llama-model.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2a9d4c2f695bc..cda4db4674de9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1,4 +1,4 @@
-#include "llama-model.h"
+dee#include "llama-model.h"

 #include "llama-impl.h"
 #include "llama-mmap.h"
@@ -10143,6 +10143,10 @@ struct llm_build_deepseek2 : public llm_graph_context {
         cb(kv_cmpr, "kv_cmpr", il);

         if (is_mla) {
+            // {n_embd_head_qk_rope, n_tokens, n_head}
+            q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3);
+            cb(q_pe, "q_pe_perm", il);
+
             // {n_embd_head_qk_nope, n_tokens, n_head}
             q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
             cb(q_nope, "q_nope_perm", il);

From 3c4423c4ffcede1172f02133b8c99de83a4b3950 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 12:41:43 +0100
Subject: [PATCH 3/8] removed random "dee" characters added

---
 src/llama-model.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index cda4db4674de9..dfbe5f6aea9b5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1,4 +1,4 @@
-dee#include "llama-model.h"
+#include "llama-model.h"

 #include "llama-impl.h"
 #include "llama-mmap.h"
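A brief aside on the `0, 2, 1, 3` permutes used in patches 1 and 2 (an illustrative sketch, not part of any patch): `ggml_permute()` takes, for each source axis, the position that axis should occupy in the result, so `0, 2, 1, 3` swaps axes 1 and 2 while leaving the per-head embedding axis in place, and it only creates a view; no data is moved until the tensor is made contiguous. Assuming a graph context `ctx0` and the usual dimension names:

    // q starts out as {n_embd_head, n_head, n_tokens}
    ggml_tensor * q = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens);

    // swap axes 1 and 2 -> {n_embd_head, n_tokens, n_head}; this is a view, so a later
    // ggml_cont() (or a copy) is needed before ops that require contiguous memory
    ggml_tensor * q_perm = ggml_permute(ctx0, q, 0, 2, 1, 3);

This is also why patch 2 is needed: once patch 1 delays the permute until after the concat, both `q_pe` and `q_nope_absorbed` have to be in the {*, n_tokens, n_head} layout for the concat along axis 0 to line up.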
From 05184613f4ad9c028ee059a485c61f095bac3cb6 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 12:45:15 +0100
Subject: [PATCH 4/8] cont `vCur`

---
 src/llama-model.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index dfbe5f6aea9b5..ed62191ea8d9d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -10175,6 +10175,9 @@ struct llm_build_deepseek2 : public llm_graph_context {
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);

+            Vcur = ggml_cont(ctx0, Vcur);
+            cb(Vcur, "Vcur_cont", il);
+
             // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, NULL,

From f2fcd2c15c0ef488ad55a93a702b9fc4634c3c3d Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 13:00:52 +0100
Subject: [PATCH 5/8] Add back `MQA` 2D x 2D optimisation

---
 src/llama-graph.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 5d0222b981058..333a19c681813 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1203,9 +1203,10 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     // note: for MLA with the absorption optimization, the final embedding size will be changed via v_mla
     const auto n_embd_head_v = v_mla == nullptr ? v_trans ? v->ne[1] : v->ne[0] : v_mla->ne[1];

-    const auto n_tokens = q->ne[1];
-    const auto n_head   = q->ne[2];
-    const auto n_kv     = k->ne[1];
+    const auto n_tokens  = q->ne[1];
+    const auto n_head    = q->ne[2];
+    const auto n_kv      = k->ne[1];
+    const auto n_head_kv = k->ne[2];

     ggml_tensor * cur;

@@ -1233,12 +1234,21 @@ ggml_tensor * llm_graph_context::build_attn_mha(

         cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
     } else {
+        // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply
+        if (ggml_is_contiguous(q) && n_head_kv == 1) {
+            q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
+        }
+
         ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);

         // note: this op tends to require high floating point range
         // while for some models F16 is enough, for others it is not, so we default to F32 here
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

+        if (ggml_is_contiguous(q) && n_head_kv == 1) {
+            kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);
+        }
+
         if (arch == LLM_ARCH_GROK) {
             // need to do the following:
             // multiply by attn_output_multiplyer of 0.08838834764831845
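The shape bookkeeping behind the 2D x 2D path added in patch 5, sketched with the same names the patch uses in `build_attn_mha()` (illustrative only): with a single KV head every query head attends to the same K, so the head axis of `q` can be folded into the token axis and the batched multiply collapses into one large mat-mul:

    // k : {n_embd, n_kv}              (single KV head)
    // q : {n_embd, n_tokens, n_head}
    q  = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);    // {n_embd, n_tokens*n_head}
    ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);                // {n_kv, n_tokens*n_head}
    kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);     // back to {n_kv, n_tokens, n_head}

Both reshapes are views, so the only real change is that the backend runs one big mat-mul instead of `n_head` small ones. The `ggml_is_contiguous(q)` guard is there because `ggml_reshape_2d()` requires a contiguous source tensor.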
From 1d63edf9b330896f4104263ec5efa1d23d8dc615 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 13:06:00 +0100
Subject: [PATCH 6/8] Check `k` is contiguous and reshape it

---
 src/llama-graph.cpp | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 333a19c681813..6f353f8d601fd 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1235,18 +1235,17 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply
-        if (ggml_is_contiguous(q) && n_head_kv == 1) {
+        if (ggml_is_contiguous(k) && ggml_is_contiguous(q) && n_head_kv == 1) {
+            k = ggml_reshape_2d(ctx0, k, n_embd, n_tokens);
             q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
-        }
-
-        ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
-
-        // note: this op tends to require high floating point range
-        // while for some models F16 is enough, for others it is not, so we default to F32 here
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-
-        if (ggml_is_contiguous(q) && n_head_kv == 1) {
+            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
             kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);
+        } else {
+            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
         }

         if (arch == LLM_ARCH_GROK) {

From 7b66649a327261721ce3bb4eb5e56e2c3a721edd Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 13:08:17 +0100
Subject: [PATCH 7/8] Fix `kq`

---
 src/llama-graph.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 6f353f8d601fd..b07438292418c 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1235,15 +1235,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply
+        ggml_tensor * kq = nullptr;
         if (ggml_is_contiguous(k) && ggml_is_contiguous(q) && n_head_kv == 1) {
             k = ggml_reshape_2d(ctx0, k, n_embd, n_tokens);
             q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
-            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            kq = ggml_mul_mat(ctx0, k, q);
             // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
             kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);
         } else {
-            ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            kq = ggml_mul_mat(ctx0, k, q);
             // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
         }

From 959a793005964acc8e19e730ea2582aa62f1fab9 Mon Sep 17 00:00:00 2001
From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 15 Apr 2025 13:16:26 +0100
Subject: [PATCH 8/8] Added missing `n_embd` and fixed `n_kv` bug

---
 src/llama-graph.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index b07438292418c..1d7400446cc52 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1203,8 +1203,10 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     // note: for MLA with the absorption optimization, the final embedding size will be changed via v_mla
     const auto n_embd_head_v = v_mla == nullptr ? v_trans ? v->ne[1] : v->ne[0] : v_mla->ne[1];

+    const auto n_embd    = q->ne[0];
     const auto n_tokens  = q->ne[1];
     const auto n_head    = q->ne[2];
+
     const auto n_kv      = k->ne[1];
     const auto n_head_kv = k->ne[2];

@@ -1237,7 +1239,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply
         ggml_tensor * kq = nullptr;
         if (ggml_is_contiguous(k) && ggml_is_contiguous(q) && n_head_kv == 1) {
-            k = ggml_reshape_2d(ctx0, k, n_embd, n_tokens);
+            k = ggml_reshape_2d(ctx0, k, n_embd, n_kv);
             q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
             kq = ggml_mul_mat(ctx0, k, q);
             // note: this op tends to require high floating point range while for some models F16 is enough, for others it is not, so we default to F32 here
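Taken together, patches 5 through 8 leave the non-flash-attention branch of `build_attn_mha()` looking roughly like the sketch below (assembled from the hunks above for readability; not itself one of the patches). The patch 8 fix matters because `k` has `n_kv` rows rather than `n_tokens`, and the two are generally different once the KV cache already holds earlier tokens, so reshaping `k` with `n_tokens` was a bug:

    // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply
    ggml_tensor * kq = nullptr;
    if (ggml_is_contiguous(k) && ggml_is_contiguous(q) && n_head_kv == 1) {
        // fold the head axis of q into the token axis, do one 2D x 2D mat-mul,
        // then restore the {n_kv, n_tokens, n_head} layout expected downstream
        k  = ggml_reshape_2d(ctx0, k, n_embd, n_kv);
        q  = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head);
        kq = ggml_mul_mat(ctx0, k, q);
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);   // kq needs the extra floating point range
        kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head);
    } else {
        // general MHA/GQA path: batched mat-mul over the KV heads
        kq = ggml_mul_mat(ctx0, k, q);
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    }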