@@ -582,7 +582,7 @@ static struct ggml_tensor * llm_build_kqv(
 
         // split cached v into n_head heads (not transposed)
         struct ggml_tensor * v =
-                ggml_view_3d(ctx, kv.v_l[il],
+                ggml_view_3d(ctx, kv.v_l[il],
                         n_embd_head_v, n_kv, n_head_kv,
                         ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
                         ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
@@ -592,8 +592,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * padded_v = v;
         int64_t n_embd_head_v_out = n_embd_head_v;
         if (n_embd_head_v < n_embd_head_k) {
-            // Pad the feature dimension (assuming it's the third dimension, adjust indices as per actual tensor layout)
-            padded_v = ggml_pad(ctx, v, 0, 0, k->ne[2] - v->ne[2], 0); // Correct dimension for feature padding
+            padded_v = ggml_pad(ctx, v, k->ne[0] - v->ne[0], 0, 0, 0); // pad V's head dim (dim 0) up to K's head dim
             cb(padded_v, "padded_v", il);
             n_embd_head_v_out = n_embd_head_k;
         }
@@ -603,6 +602,15 @@ static struct ggml_tensor * llm_build_kqv(
 
         ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
+        if (n_embd_head_v < n_embd_head_k) {
+            cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens);
+            cur = ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens,
+                    ggml_element_size(cur) * n_embd_head_v_out,
+                    ggml_element_size(cur) * n_embd_head_v_out * n_head,
+                    0);
+            cur = ggml_cont(ctx, cur);
+        }
+
         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
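For reference, here is a minimal standalone sketch of the pad-then-slice idea the hunks above rely on, assuming only core ggml; the head sizes (192/128), cache/batch sizes, and the standalone setup are illustrative assumptions, not values from the actual call site. Because V is zero-padded along its head dimension (ne[0]) before flash attention, the extra output columns are exact zeros, so keeping only the first n_embd_head_v elements of each output row recovers the unpadded result; that slice is taken with a strided view followed by ggml_cont.

// Sketch of padding V's head dim and slicing the padded output back.
// Sizes are hypothetical; only the ggml shape/stride bookkeeping is shown.
#include "ggml.h"

#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t n_embd_head_k = 192; // K head size (hypothetical)
    const int64_t n_embd_head_v = 128; // V head size, smaller than K's
    const int64_t n_kv          = 32;
    const int64_t n_head        = 4;
    const int64_t n_tokens      = 8;

    // V in the non-transposed flash-attention layout: [head_dim, n_kv, n_head]
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head_v, n_kv, n_head);

    // zero-pad dim 0 (the feature/head dimension) up to K's head size
    struct ggml_tensor * padded_v = ggml_pad(ctx, v, n_embd_head_k - n_embd_head_v, 0, 0, 0);

    // stand-in for the attention output, whose rows now have the padded width
    struct ggml_tensor * out = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head_k, n_head, n_tokens);

    // keep only the first n_embd_head_v values of each row (strided view),
    // then make the result contiguous so it can be reshaped to 2D
    struct ggml_tensor * sliced = ggml_view_3d(ctx, out, n_embd_head_v, n_head, n_tokens,
            ggml_element_size(out) * n_embd_head_k,
            ggml_element_size(out) * n_embd_head_k * n_head,
            0);
    sliced = ggml_cont(ctx, sliced);
    sliced = ggml_reshape_2d(ctx, sliced, n_embd_head_v*n_head, n_tokens);

    printf("padded_v: [%lld, %lld, %lld]\n",
            (long long) padded_v->ne[0], (long long) padded_v->ne[1], (long long) padded_v->ne[2]);
    printf("sliced  : [%lld, %lld]\n",
            (long long) sliced->ne[0], (long long) sliced->ne[1]);

    ggml_free(ctx);
    return 0;
}

In the real code the padded width is n_embd_head_v_out and the tensor being sliced comes from ggml_flash_attn_ext; here a freshly created tensor of the padded shape stands in for it, since only the view strides and the follow-up cont/reshape are being demonstrated.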