@@ -11701,8 +11701,6 @@ struct llm_build_plm : public llm_graph_context {
                 ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
         cb(k_pe, "k_pe", il);
 
-        // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
-        kv_compressed = ggml_cont(ctx0, kv_compressed);
         kv_compressed = build_norm(kv_compressed,
                 model.layers[il].attn_kv_a_norm, NULL,
                 LLM_NORM_RMS, il);
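Context for this first removal: `kv_compressed` is a strided view into `kv_pe_compresseed`, and the `ggml_cont` copy existed only because the CUDA backend once required a contiguous input for RMS norm; deleting it suggests the non-contiguous case is now handled. A minimal standalone sketch of the now-allowed pattern, with made-up shapes (576/512 values per row, 8 tokens) and epsilon standing in for the real PLM hyperparameters:

```cpp
#include "ggml.h"

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);

    // stand-in for kv_pe_compresseed: rows of 576 values, 8 tokens
    ggml_tensor * kv_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 576, 8);

    // first 512 values of each row, keeping the full 576-value row stride:
    // a non-contiguous view, like kv_compressed in the PLM graph
    ggml_tensor * kv_compressed = ggml_view_2d(ctx, kv_pe, 512, 8, kv_pe->nb[1], 0);

    // previously: kv_compressed = ggml_cont(ctx, kv_compressed);
    ggml_tensor * out = ggml_rms_norm(ctx, kv_compressed, 1e-6f);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    ggml_free(ctx);
    return 0;
}
```

Dropping the copy saves one full read/write pass over the tensor per layer.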
@@ -11734,7 +11732,6 @@ struct llm_build_plm : public llm_graph_context {
                 0);
         cb(v_states, "v_states", il);
 
-        q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
         q_pe = ggml_rope_ext(
                 ctx0, q_pe, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -11743,7 +11740,6 @@ struct llm_build_plm : public llm_graph_context {
         cb(q_pe, "q_pe", il);
 
         // shared RoPE key
-        k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
         k_pe = ggml_rope_ext(
                 ctx0, k_pe, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
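The `q_pe`/`k_pe` removals are the same story for RoPE: both are strided views (the trailing rotary slice of each head), and the defensive `ggml_cont` copies existed only for the old CUDA limitation. A standalone sketch of rotating such a view directly; the shapes (192 values per head, 64 rotary), NEOX mode, and frequency parameters are illustrative assumptions, not values from this PR:

```cpp
#include "ggml.h"

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);

    const int n_rot = 64, n_head = 4, n_tokens = 8;

    // stand-in for the query tensor: 192 values per head, of which the
    // trailing 64 are the rotary part (like q_pe in the PLM graph)
    ggml_tensor * q    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 192, n_head, n_tokens);
    ggml_tensor * q_pe = ggml_view_3d(ctx, q, n_rot, n_head, n_tokens,
            q->nb[1], q->nb[2], ggml_row_size(q->type, 128)); // non-contiguous

    // token positions for RoPE
    ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);

    // previously: q_pe = ggml_cont(ctx, q_pe);
    q_pe = ggml_rope_ext(ctx, q_pe, pos, nullptr,
            n_rot, GGML_ROPE_TYPE_NEOX, 0, 10000.0f, 1.0f,
            0.0f, 1.0f, 32.0f, 1.0f);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, q_pe);

    ggml_free(ctx);
    return 0;
}
```

It would be worth confirming that `test-backend-ops` exercises the non-contiguous ROPE and RMS_NORM cases on CUDA before relying on this everywhere.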