@@ -13463,7 +13463,7 @@ struct llm_build_context {
1346313463 0);
1346413464 cb(kv_cache_trans, "kv_cache_trans", il);
1346513465
13466- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
13466+ // q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
1346713467 q_pe = ggml_rope_ext(
1346813468 ctx0, q_pe, inp_pos, nullptr,
1346913469 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13472,7 +13472,7 @@ struct llm_build_context {
1347213472 cb(q_pe, "q_pe", il);
1347313473
1347413474 // shared RoPE key
13475- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
13475+ // k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
1347613476 k_pe = ggml_rope_ext(
1347713477 ctx0, k_pe, inp_pos, nullptr,
1347813478 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13508,15 +13508,17 @@ struct llm_build_context {
1350813508 struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm);
1350913509 cb(kq_nope, "kq_nope", il);
1351013510
13511- struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
13512- cb(q_pe_perm, "q_pe_perm", il);
13511+ // NOTE: q_pe_perm was computed but never consumed by any later op, so the
13511+ // permute (and its callback) are dead code; disabled rather than deleted
13511+ // to keep the original intent visible for review.
13512+ //struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
13513+ //cb(q_pe_perm, "q_pe_perm", il);
1351313514
1351413515 struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe);
1351513516 cb(kq_pe, "kq_pe", il);
1351613517
1351713518 struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe);
1351813519 cb(kq, "kq", il);
1351913520
13521+ // We need this copy because soft_max expects a contiguous tensor
1352013522 kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3));
1352113523 cb(kq, "kq_perm", il);
1352213524
0 commit comments