@@ -1270,6 +1270,7 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
12701270
12711271std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_build_mul_mat_qkv (ggml_cgraph * gf, ggml_tensor * cur,
12721272 ggml_tensor * wqkv, ggml_tensor * bqkv,
1273+ ggml_tensor * wqk, ggml_tensor * bqk,
12731274 ggml_tensor * wq, ggml_tensor * bq,
12741275 ggml_tensor * wk, ggml_tensor * bk,
12751276 ggml_tensor * wv, ggml_tensor * bv,
@@ -1307,6 +1308,40 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
13071308 // ggml_build_forward_expand(gf, Vcur);
13081309 }
13091310
1311+ if (wqk) { // branch taken when a single weight tensor producing both Q and K is supplied; V is handled separately below
1312+ auto qk = llm_build_lora_mm (lctx, ctx0, wqk, cur); // one projection of cur through wqk; Q and K are later carved out of this result as views
1313+ cb (qk, " qkv" , il); // NOTE(review): the callback label reads qkv although this tensor only holds the wqk product - confirm intended
1314+ if (bqk) { // optional bias for the fused Q+K result
1315+ qk = ggml_add (ctx0, qk, bqk);
1316+ cb (qk, " qkv_b" , il); // NOTE(review): same qkv-style label as above, applied after the bias add
1317+ }
1318+ auto Vcur = llm_build_lora_mm (lctx, ctx0, wv, cur); // V gets its own projection from wv; NOTE(review): wv is not null-checked in this branch - verify callers always pass it together with wqk
1319+ cb (Vcur, " Vcur" , il);
1320+ if (bv) { // optional V bias
1321+ Vcur = ggml_add (ctx0, Vcur, bv);
1322+ cb (Vcur, " Vcur" , il); // the same label is reported again for the biased tensor
1323+ }
1324+ ggml_build_forward_expand (gf, qk); // register the fused projection in graph gf before views are taken from it
1325+ ggml_build_forward_expand (gf, Vcur); // register V in graph gf as well
1326+ auto Qcur = ggml_view_3d (ctx0, qk, n_embd_head, n_head, n_tokens, n_embd_head*sizeof (float ), qk->nb [1 ], 0 *sizeof (float )*(n_embd)); // Q view: n_embd_head x n_head x n_tokens at byte offset 0 (the offset expression multiplies by 0); strides use sizeof(float) - assumes qk elements are float-sized, TODO confirm
1327+ auto Kcur = ggml_view_3d (ctx0, qk, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof (float ), qk->nb [1 ], 1 *sizeof (float )*Qcur->ne [0 ]*Qcur->ne [1 ]); // K view: n_embd_head x n_head_kv x n_tokens, starting sizeof(float)*Qcur->ne[0]*Qcur->ne[1] bytes in, i.e. just past the span covered by the first two dims of the Q view
1328+ cb (Qcur, " Qcur" , il);
1329+ cb (Kcur, " Kcur" , il);
1330+ if (q_norm) { // q_norm only gates this step; NOTE(review): the weight passed is model.layers[il].attn_q_norm rather than q_norm itself - confirm they are the same tensor
1331+ Qcur = llm_build_norm (ctx0, Qcur, hparams, model.layers [il].attn_q_norm , NULL , LLM_NORM_RMS, cb, il); // RMS-type norm (LLM_NORM_RMS) with no bias argument
1332+ cb (Qcur, " Qcur_normed" , il);
1333+ ggml_build_forward_expand (gf, Qcur);
1334+ }
1335+ if (k_norm) { // same pattern for K: k_norm gates, model.layers[il].attn_k_norm supplies the weight - NOTE(review): confirm they match
1336+ Kcur = llm_build_norm (ctx0, Kcur, hparams, model.layers [il].attn_k_norm , NULL , LLM_NORM_RMS, cb, il); // RMS-type norm (LLM_NORM_RMS) with no bias argument
1337+ cb (Kcur, " Kcur_normed" , il);
1338+ ggml_build_forward_expand (gf, Kcur);
1339+ }
1340+
1341+ return {Qcur, Kcur, Vcur}; // early return, skipping the separate wq/wk/wv path below; NOTE(review): attention_scale is never referenced in this branch although the call below forwards it - confirm that is intended
1342+
1343+ }
1344+
13101345 auto [Q, K, V] = llm_build_mul_mat_qkv (gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il);
13111346 auto Qcur = ggml_reshape_3d (ctx0, Q, n_embd_head, n_head, n_tokens);
13121347 if (q_norm) {
@@ -1374,6 +1409,7 @@ ggml_cgraph * llm_build_context::build_llama() {
13741409
13751410 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur,
13761411 model.layers [il].wqkv , model.layers [il].bqkv ,
1412+ model.layers [il].wqk , model.layers [il].bqk ,
13771413 model.layers [il].wq , model.layers [il].bq ,
13781414 model.layers [il].wk , model.layers [il].bk ,
13791415 model.layers [il].wv , model.layers [il].bv ,
@@ -3400,6 +3436,7 @@ ggml_cgraph * llm_build_context::build_qwen3() {
34003436 {
34013437 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur,
34023438 model.layers [il].wqkv , nullptr ,
3439+ model.layers [il].wqk , nullptr ,
34033440 model.layers [il].wq , nullptr ,
34043441 model.layers [il].wk , nullptr ,
34053442 model.layers [il].wv , nullptr ,
@@ -3502,6 +3539,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
35023539 {
35033540 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur,
35043541 model.layers [il].wqkv , nullptr ,
3542+ model.layers [il].wqk , nullptr ,
35053543 model.layers [il].wq , nullptr , model.layers [il].wk , nullptr , model.layers [il].wv , nullptr ,
35063544 model.layers [il].attn_q_norm , model.layers [il].attn_k_norm , 0 , il);
35073545
@@ -6403,6 +6441,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
64036441 {
64046442 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur,
64056443 model.layers [il].wqkv , model.layers [il].bqkv ,
6444+ model.layers [il].wqk , model.layers [il].bqk ,
64066445 model.layers [il].wq , model.layers [il].bq ,
64076446 model.layers [il].wk , model.layers [il].bk ,
64086447 model.layers [il].wv , model.layers [il].bv ,
@@ -6814,6 +6853,7 @@ ggml_cgraph * llm_build_context::build_cohere2() {
68146853
68156854 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur,
68166855 model.layers [il].wqkv , model.layers [il].bqkv ,
6856+ model.layers [il].wqk , model.layers [il].bqk ,
68176857 model.layers [il].wq , model.layers [il].bq ,
68186858 model.layers [il].wk , model.layers [il].bk ,
68196859 model.layers [il].wv , model.layers [il].bv , nullptr , nullptr , 0 .f , il);
@@ -8116,6 +8156,7 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
81168156 {
81178157 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur,
81188158 model.layers [il].wqkv , model.layers [il].bqkv ,
8159+ model.layers [il].wqk , model.layers [il].bqk ,
81198160 model.layers [il].wq , model.layers [il].bq ,
81208161 model.layers [il].wk , model.layers [il].bk ,
81218162 model.layers [il].wv , model.layers [il].bv ,
@@ -8234,7 +8275,7 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() {
82348275 // self_attention
82358276 {
82368277 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv (gf, cur, model.layers [il].wqkv , model.layers [il].bqkv ,
8237- nullptr , nullptr , nullptr , nullptr , nullptr , nullptr ,
8278+ nullptr , nullptr , nullptr , nullptr , nullptr , nullptr , nullptr , nullptr ,
82388279 model.layers [il].attn_q_norm , model.layers [il].attn_k_norm , 0 .0f , il);
82398280
82408281 if (rope_cache) {
0 commit comments