@@ -5670,12 +5670,10 @@ struct llm_build_falcon : public llm_graph_context {
56705670 cur = build_lora_mm(model.layers[il].wqkv, cur);
56715671 cb(cur, "wqkv", il);
56725672
5673- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
5674- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
5673+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
5674+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
56755675 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
56765676
5677- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5678- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
56795677 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
56805678
56815679 // using mode = 2 for neox mode
@@ -5952,12 +5950,10 @@ struct llm_build_dbrx : public llm_graph_context {
59525950 cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
59535951 cb(cur, "wqkv_clamped", il);
59545952
5955- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
5956- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
5953+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
5954+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
59575955 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
59585956
5959- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5960- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
59615957 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
59625958
59635959 Qcur = ggml_rope_ext(
@@ -6468,12 +6464,10 @@ struct llm_build_neo_bert : public llm_graph_context {
64686464 cur = build_lora_mm(model.layers[il].wqkv, cur);
64696465 cb(cur, "wqkv", il);
64706466
6471- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
6472- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
6467+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6468+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
64736469 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
64746470
6475- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6476- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
64776471 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
64786472
64796473 // RoPE
@@ -6703,8 +6697,8 @@ struct llm_build_mpt : public llm_graph_context {
67036697 cb(cur, "wqkv_clamped", il);
67046698 }
67056699
6706- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
6707- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
6700+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
6701+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
67086702 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
67096703
67106704 cb(Qcur, "Qcur", il);
@@ -6724,6 +6718,12 @@ struct llm_build_mpt : public llm_graph_context {
67246718 model.layers[il].attn_k_norm_b,
67256719 LLM_NORM, il);
67266720 cb(Kcur, "Kcur", il);
6721+ } else {
6722+ Qcur = ggml_cont(ctx0, Qcur);
6723+ cb(Qcur, "Qcur", il);
6724+
6725+ Kcur = ggml_cont(ctx0, Kcur);
6726+ cb(Kcur, "Kcur", il);
67276727 }
67286728
67296729 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6978,12 +6978,10 @@ struct llm_build_qwen : public llm_graph_context {
69786978 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
69796979 cb(cur, "bqkv", il);
69806980
6981- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
6982- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
6981+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6982+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
69836983 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
69846984
6985- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6986- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
69876985 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
69886986
69896987 // using mode = 2 for neox mode
@@ -7748,21 +7746,21 @@ struct llm_build_phi2 : public llm_graph_context {
77487746 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
77497747 cb(cur, "bqkv", il);
77507748
7751- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
7752- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
7749+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7750+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
77537751 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
77547752 } else {
77557753 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
77567754 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
77577755 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7756+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7757+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
77587758 }
77597759
77607760 cb(Qcur, "Qcur", il);
77617761 cb(Kcur, "Kcur", il);
77627762 cb(Vcur, "Vcur", il);
77637763
7764- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7765- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
77667764 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
77677765
77687766 Qcur = ggml_rope_ext(
@@ -7886,21 +7884,21 @@ struct llm_build_phi3 : public llm_graph_context {
78867884 cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
78877885 cb(cur, "wqkv", il);
78887886
7889- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd) ));
7890- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd) ));
7887+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
7888+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
78917889 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
78927890 } else {
78937891 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
78947892 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
78957893 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7894+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7895+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
78967896 }
78977897
78987898 cb(Qcur, "Qcur", il);
78997899 cb(Kcur, "Kcur", il);
79007900 cb(Vcur, "Vcur", il);
79017901
7902- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7903- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
79047902 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
79057903
79067904 Qcur = ggml_rope_ext(
@@ -8256,12 +8254,10 @@ struct llm_build_codeshell : public llm_graph_context {
82568254 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
82578255 cb(cur, "bqkv", il);
82588256
8259- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
8260- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
8257+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
8258+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
82618259 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
82628260
8263- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8264- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
82658261 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
82668262
82678263 Qcur = ggml_rope_ext(
@@ -8677,8 +8673,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
86778673 ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
86788674 cb(k_pe, "k_pe", il);
86798675
8680- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
8681- kv_compressed = ggml_cont(ctx0, kv_compressed);
86828676 kv_compressed = build_norm(kv_compressed,
86838677 model.layers[il].attn_kv_a_norm, NULL,
86848678 LLM_NORM_RMS, il);
@@ -8710,7 +8704,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
87108704 0);
87118705 cb(v_states, "v_states", il);
87128706
8713- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
87148707 q_pe = ggml_rope_ext(
87158708 ctx0, q_pe, inp_pos, rope_factors,
87168709 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8719,7 +8712,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
87198712 cb(q_pe, "q_pe", il);
87208713
87218714 // shared RoPE key
8722- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
87238715 k_pe = ggml_rope_ext(
87248716 ctx0, k_pe, inp_pos, rope_factors,
87258717 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -10784,10 +10776,10 @@ struct llm_build_openelm : public llm_graph_context {
1078410776
1078510777 cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
1078610778
10787- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0) );
10779+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
1078810780 cb(Qcur, "Qcur", il);
1078910781
10790- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head) );
10782+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
1079110783 cb(Kcur, "Kcur", il);
1079210784
1079310785 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
@@ -10909,12 +10901,10 @@ struct llm_build_gptneox : public llm_graph_context {
1090910901 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
1091010902 cb(cur, "bqkv", il);
1091110903
10912- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
10913- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
10904+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
10905+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
1091410906 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
1091510907
10916- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10917- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1091810908 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1091910909
1092010910 Qcur = ggml_rope_ext(
@@ -12159,20 +12149,20 @@ struct llm_build_chatglm : public llm_graph_context {
1215912149 if (model.layers[il].bv) {
1216012150 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
1216112151 }
12152+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12153+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1216212154 } else {
1216312155 cur = build_lora_mm(model.layers[il].wqkv, cur);
1216412156 cb(cur, "wqkv", il);
1216512157 if (model.layers[il].bqkv) {
1216612158 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
1216712159 cb(cur, "bqkv", il);
1216812160 }
12169- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
12170- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
12161+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12162+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
1217112163 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
1217212164 }
1217312165
12174- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12175- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1217612166 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1217712167
1217812168 //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -12293,20 +12283,20 @@ struct llm_build_glm4 : public llm_graph_context {
1229312283 if (model.layers[il].bv) {
1229412284 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
1229512285 }
12286+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12287+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1229612288 } else {
1229712289 cur = build_lora_mm(model.layers[il].wqkv, cur);
1229812290 cb(cur, "wqkv", il);
1229912291 if (model.layers[il].bqkv) {
1230012292 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
1230112293 cb(cur, "bqkv", il);
1230212294 }
12303- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
12304- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
12295+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12296+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
1230512297 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
1230612298 }
1230712299
12308- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12309- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1231012300 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1231112301
1231212302 Qcur = ggml_rope_ext(
0 commit comments