@@ -5935,6 +5935,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
     const uint32_t n_embd_head_qk_rope = hparams.n_rot;
     const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
     const uint32_t kv_lora_rank = hparams.n_lora_kv;
+    const uint32_t q_lora_rank = hparams.n_lora_q;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -5961,68 +5962,96 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
 
         // self_attention
        {
-            struct ggml_tensor * q = NULL;
-            if (!is_lite) {
-                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
-                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
-                cb(q, "q", il);
-
-                q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
-                cb(q, "q", il);
-
-                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
-                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
-                cb(q, "q", il);
-            } else {
-                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(q, "q", il);
+            ggml_tensor * q = nullptr;
+            ggml_tensor * kv_rope_compressed = nullptr;
+            ggml_tensor * q_rope;
+            ggml_tensor * q_nope;
+            ggml_tensor * k_rope;
+            ggml_tensor * kv_compressed;
+            if (model.layers[il].wkq_a_mqa) {
+                auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur);
+                cb(mqa, "mqa", il);
+                size_t qnb1;
+                if (!is_lite) {
+                    q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);
+                    q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    qnb1 = q->nb[1];
+                    cb(q, "q", il);
+                    kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
+                            q_lora_rank*ggml_element_size(mqa));
+                } else {
+                    q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0);
+                    kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
+                            n_embd_k_gqa*ggml_element_size(mqa));
+                    qnb1 = mqa->nb[1];
+                }
+                q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, 0);
+                q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, ggml_row_size(q->type, n_embd_head_qk_nope));
+                k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                        mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
+                kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0);
             }
+            else {
+                if (!is_lite) {
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                    cb(q, "q", il);
 
-            // split into {n_head * n_embd_head_qk_nope, n_tokens}
-            struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                    0);
-            cb(q_nope, "q_nope", il);
+                    kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                    cb(kv_rope_compressed, "kv_rope_compressed", il);
 
-            // and {n_head * n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                    ggml_row_size(q->type, n_embd_head_qk_nope));
-            cb(q_rope, "q_rope", il);
+                    ggml_build_forward_expand(gf, q);
+                    ggml_build_forward_expand(gf, kv_rope_compressed);
 
-            q_rope = ggml_rope_ext(
-                    ctx0, q_rope, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
-            );
-            cb(q_rope, "q_rope", il);
+                    q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
+                    cb(q, "q", il);
+
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    cb(q, "q", il);
+                } else {
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                    cb(q, "q", il);
 
-            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
-            cb(kv_rope_compresseed, "kv_rope_compresseed", il);
+                    kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                    cb(kv_rope_compressed, "kv_rope_compressed", il);
 
-            // and {n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens,
-                    kv_rope_compresseed->nb[1],
-                    kv_rope_compresseed->nb[1],
-                    ggml_row_size(kv_rope_compresseed->type, kv_lora_rank));
+                    ggml_build_forward_expand(gf, q);
+                    ggml_build_forward_expand(gf, kv_rope_compressed);
+                }
+
+                q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);
+
+                q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+
+                k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_rope_compressed->nb[1],
+                        kv_rope_compressed->nb[1],
+                        ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
+
+                kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
+                        kv_rope_compressed->nb[1], 0);
+            }
+            cb(q_nope, "q_nope", il);
+            cb(q_rope, "q_rope", il);
             cb(k_rope, "k_rope", il);
+            cb(kv_compressed, "kv_compressed", il);
 
-            // shared RoPE key
-            k_rope = ggml_rope_ext(
-                    ctx0, k_rope, inp_pos, nullptr,
+            q_rope = ggml_rope_ext(ctx0, q_rope, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
-            );
-            cb(k_rope, "k_rope", il);
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow);
+            cb(q_rope, "q_rope", il);
 
-            // split into {kv_lora_rank, n_tokens}
-            struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens,
-                    kv_rope_compresseed->nb[1],
-                    0);
-            cb(kv_compressed, "kv_compressed", il);
+            k_rope = ggml_rope_ext(ctx0, k_rope, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow);
+            cb(k_rope, "k_rope", il);
 
             kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, cb, il);
             cb(kv_compressed, "kv_compressed", il);
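
The load-bearing idea in the new `wkq_a_mqa` path is that one fused matmul replaces the separate `wq_a` and `wkv_a_mqa` projections, and `ggml_view_2d`/`ggml_view_3d` then carve the per-purpose slices back out of the shared result without copying. As a sanity check on the offset arithmetic, here is a minimal standalone sketch of that view technique. It is my illustration, not part of this diff: the sizes are made up, and it assumes only the stock `ggml.h` API.

```cpp
#include "ggml.h"

int main() {
    struct ggml_init_params params = { /*mem_size   =*/ 16*1024*1024,
                                       /*mem_buffer =*/ NULL,
                                       /*no_alloc   =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // illustrative sizes only, not real DeepSeek2 hyperparameters
    const int64_t q_lora_rank  = 8;
    const int64_t kv_lora_rank = 4;
    const int64_t n_rope       = 2;
    const int64_t n_tokens     = 3;

    // stand-in for the fused projection output: each column (one per token)
    // holds [compressed q | compressed kv | rope key] back to back
    struct ggml_tensor * mqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
            q_lora_rank + kv_lora_rank + n_rope, n_tokens);

    // rows [0, q_lora_rank): the query slice, starting at byte offset 0
    struct ggml_tensor * q = ggml_view_2d(ctx, mqa, q_lora_rank, n_tokens,
            mqa->nb[1], 0);

    // rows [q_lora_rank, end): compressed KV plus the shared RoPE key;
    // the offset argument is in bytes, hence ggml_element_size()
    struct ggml_tensor * kv_rope = ggml_view_2d(ctx, mqa,
            kv_lora_rank + n_rope, n_tokens,
            mqa->nb[1], q_lora_rank*ggml_element_size(mqa));

    // both views keep mqa's column stride nb[1], so column t of each view
    // aliases a sub-range of column t of mqa -- no data is copied
    GGML_ASSERT(q->nb[1] == mqa->nb[1]);
    GGML_ASSERT(kv_rope->data == (void *)((char *) mqa->data
            + q_lora_rank*ggml_element_size(mqa)));

    ggml_free(ctx);
    return 0;
}
```

This is also why the diff tracks `qnb1` separately: in the non-lite branch `q` is a fresh `wq_b` matmul result whose row stride is its own `nb[1]`, while in the lite branch `q` remains a view into `mqa`, so the later `ggml_view_3d` splits must stride by `mqa->nb[1]` to skip over the KV/RoPE rows that share each column.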