
Commit ea859a2

CUDA: add fused rope
1 parent 4146d6a commit ea859a2

File tree

3 files changed: +201, -58 lines changed


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 56 additions & 0 deletions
@@ -2964,6 +2964,36 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 }
 #endif
 
+static bool ggml_cuda_should_fuse_rope_set_rows(const ggml_tensor * rope,
+                                                const ggml_tensor * view,
+                                                const ggml_tensor * set_rows) {
+    // ne3 not tested
+    if (rope->src[0]->ne[3] != 1) {
+        return false;
+    }
+
+    if (set_rows->type != GGML_TYPE_F32 && set_rows->type != GGML_TYPE_F16) {
+        return false;
+    }
+
+    if (set_rows->src[1]->type != GGML_TYPE_I64) {
+        return false;
+    }
+
+    // The view should flatten two dims of rope into one dim
+    if (!ggml_is_contiguous(view) || view->ne[0] != rope->ne[0] * rope->ne[1]) {
+        return false;
+    }
+
+    // Only norm/neox shaders have the fusion code
+    const int mode = ((const int32_t *) rope->op_params)[2];
+    if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+        return false;
+    }
+
+    return true;
+}
+
 static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
 #ifndef NDEBUG
     const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
@@ -3039,6 +3069,16 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         }
     }
 
+    if (ops.size() == 3 && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
+        const ggml_tensor * rope     = cgraph->nodes[node_idx];
+        const ggml_tensor * view     = cgraph->nodes[node_idx + 1];
+        const ggml_tensor * set_rows = cgraph->nodes[node_idx + 2];
+
+        if (ggml_cuda_should_fuse_rope_set_rows(rope, view, set_rows)) {
+            return true;
+        }
+    }
+
     if (!ggml_can_fuse(cgraph, node_idx, ops)) {
         return false;
     }
@@ -3170,6 +3210,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
+        if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
+            ggml_tensor * src3 = cgraph->nodes[i + 2]->src[1];
+            ggml_tensor * rope = cgraph->nodes[i];
+            ggml_tensor * dst  = cgraph->nodes[i + 2];
+
+            rope->src[3] = src3;
+            rope->data   = dst->data;
+            rope->type   = dst->type;
+
+            const size_t set_rows_stride = dst->nb[1] / ggml_type_size(dst->type);
+            ggml_set_op_params_i32(rope, 15, set_rows_stride);
+            ggml_cuda_op_rope(*cuda_ctx, rope);
+            i += 2;
+            continue;
+        }
+
         if (node->op == GGML_OP_MUL) {
             int current_node = i + 1;
             int num_views    = 0;
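Note (illustration, not part of the commit): the fused dispatch above gives the ROPE node everything needed to bypass the separate VIEW and SET_ROWS ops. The I64 row-index tensor becomes rope->src[3], the node writes directly into the SET_ROWS destination buffer, and op_params slot 15 carries the destination row stride converted from bytes to elements. A minimal sketch of that stride conversion, assuming a hypothetical packed F16 cache row of 8 heads x 128 dims:

    // Illustration only: how set_rows_stride relates to the destination's byte stride.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t type_size = 2;                  // ggml_type_size(GGML_TYPE_F16)
        const int64_t ne_row    = 8 * 128;            // elements per cache row (hypothetical)
        const int64_t nb1       = ne_row * type_size; // byte stride between cache rows, i.e. dst->nb[1]

        // Same computation as the fused dispatch: bytes per row / bytes per element.
        const int64_t set_rows_stride = nb1 / type_size;

        // The kernel addresses the cache as a flat element array:
        // row_indices[token] * set_rows_stride is the first element of that token's row.
        // Taking the stride from nb[1] rather than from the logical row length also
        // covers destinations whose rows are padded.
        std::printf("set_rows_stride = %lld elements\n", (long long) set_rows_stride);
        return 0;
    }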

ggml/src/ggml-cuda/rope.cu

Lines changed: 143 additions & 57 deletions
@@ -37,11 +37,23 @@ static __device__ void rope_yarn(
     }
 }
 
-template<bool forward, bool has_ff, typename T>
-static __global__ void rope_norm(
-    const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
-    const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
-    const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
+template <bool forward, bool has_ff, typename T, typename D>
+static __global__ void rope_norm(const T * x,
+                                 D * dst,
+                                 const int ne0,
+                                 const int ne1,
+                                 const int s1,
+                                 const int s2,
+                                 const int n_dims,
+                                 const int32_t * pos,
+                                 const float freq_scale,
+                                 const float ext_factor,
+                                 const float attn_factor,
+                                 const rope_corr_dims corr_dims,
+                                 const float theta_scale,
+                                 const float * freq_factors,
+                                 const int64_t * row_indices,
+                                 const int set_rows_stride) {
     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (i0 >= ne0) {
@@ -53,12 +65,19 @@ static __global__ void rope_norm(
     const int row_x     = row_dst % ne1;
     const int channel_x = row_dst / ne1;
 
-    const int idst = row_dst*ne0 + i0;
+    int idst = row_dst * ne0 + i0;
     const int ix = channel_x*s2 + row_x*s1 + i0;
 
+    // Fusion optimization: ROPE + VIEW + SET_ROWS.
+    // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
+    if (set_rows_stride != 0) {
+        idst = row_x * ne0 + i0;
+        idst += row_indices[channel_x] * set_rows_stride;
+    }
+
     if (i0 >= n_dims) {
-        dst[idst + 0] = x[ix + 0];
-        dst[idst + 1] = x[ix + 1];
+        dst[idst + 0] = D(x[ix + 0]);
+        dst[idst + 1] = D(x[ix + 1]);
 
         return;
     }
@@ -75,15 +94,27 @@ static __global__ void rope_norm(
     const float x0 = x[ix + 0];
     const float x1 = x[ix + 1];
 
-    dst[idst + 0] = x0*cos_theta - x1*sin_theta;
-    dst[idst + 1] = x0*sin_theta + x1*cos_theta;
+    dst[idst + 0] = D(x0 * cos_theta - x1 * sin_theta);
+    dst[idst + 1] = D(x0 * sin_theta + x1 * cos_theta);
 }
 
-template<bool forward, bool has_ff, typename T>
-static __global__ void rope_neox(
-    const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
-    const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
-    const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
+template <bool forward, bool has_ff, typename T, typename D>
+static __global__ void rope_neox(const T * x,
+                                 D * dst,
+                                 const int ne0,
+                                 const int ne1,
+                                 const int s1,
+                                 const int s2,
+                                 const int n_dims,
+                                 const int32_t * pos,
+                                 const float freq_scale,
+                                 const float ext_factor,
+                                 const float attn_factor,
+                                 const rope_corr_dims corr_dims,
+                                 const float theta_scale,
+                                 const float * freq_factors,
+                                 const int64_t * row_indices,
+                                 const int set_rows_stride) {
     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (i0 >= ne0) {
@@ -95,12 +126,19 @@ static __global__ void rope_neox(
     const int row_x     = row_dst % ne1;
     const int channel_x = row_dst / ne1;
 
-    const int idst = row_dst*ne0 + i0/2;
+    int idst = row_dst * ne0 + i0 / 2;
     const int ix = channel_x*s2 + row_x*s1 + i0/2;
 
+    // Fusion optimization: ROPE + VIEW + SET_ROWS.
+    // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices.
+    if (set_rows_stride != 0) {
+        idst = row_x * ne0 + i0 / 2;
+        idst += row_indices[channel_x] * set_rows_stride;
+    }
+
     if (i0 >= n_dims) {
-        dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
-        dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
+        dst[idst + i0 / 2 + 0] = D(x[ix + i0 / 2 + 0]);
+        dst[idst + i0 / 2 + 1] = D(x[ix + i0 / 2 + 1]);
 
         return;
     }
@@ -117,8 +155,8 @@ static __global__ void rope_neox(
     const float x0 = x[ix + 0];
     const float x1 = x[ix + n_dims/2];
 
-    dst[idst + 0] = x0*cos_theta - x1*sin_theta;
-    dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
+    dst[idst + 0] = D(x0 * cos_theta - x1 * sin_theta);
+    dst[idst + n_dims / 2] = D(x0 * sin_theta + x1 * cos_theta);
 }
 
 template<bool forward, bool has_ff, typename T>
@@ -238,11 +276,25 @@ static __global__ void rope_vision(
     dst[idst + n_dims] = x0*sin_theta + x1*cos_theta;
 }
 
-template<bool forward, typename T>
-static void rope_norm_cuda(
-    const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
-    const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
-    const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
+template <bool forward, typename T, typename D>
+static void rope_norm_cuda(const T * x,
+                           D * dst,
+                           const int ne0,
+                           const int ne1,
+                           const int s1,
+                           const int s2,
+                           const int n_dims,
+                           const int nr,
+                           const int32_t * pos,
+                           const float freq_scale,
+                           const float freq_base,
+                           const float ext_factor,
+                           const float attn_factor,
+                           const rope_corr_dims corr_dims,
+                           const float * freq_factors,
+                           const int64_t * row_indices,
+                           const int set_rows_stride,
+                           cudaStream_t stream) {
     GGML_ASSERT(ne0 % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -252,20 +304,34 @@ static void rope_norm_cuda(
 
     if (freq_factors == nullptr) {
         rope_norm<forward, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-            attn_factor, corr_dims, theta_scale, freq_factors);
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
+            freq_factors, row_indices, set_rows_stride);
     } else {
         rope_norm<forward, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-            attn_factor, corr_dims, theta_scale, freq_factors);
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
+            freq_factors, row_indices, set_rows_stride);
     }
 }
 
-template<bool forward, typename T>
-static void rope_neox_cuda(
-    const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
-    const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
-    const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
+template <bool forward, typename T, typename D>
+static void rope_neox_cuda(const T * x,
+                           D * dst,
+                           const int ne0,
+                           const int ne1,
+                           const int s1,
+                           const int s2,
+                           const int n_dims,
+                           const int nr,
+                           const int32_t * pos,
+                           const float freq_scale,
+                           const float freq_base,
+                           const float ext_factor,
+                           const float attn_factor,
+                           const rope_corr_dims corr_dims,
+                           const float * freq_factors,
+                           const int64_t * row_indices,
+                           const int set_rows_stride,
+                           cudaStream_t stream) {
     GGML_ASSERT(ne0 % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -274,13 +340,13 @@ static void rope_neox_cuda(
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     if (freq_factors == nullptr) {
-        rope_neox<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-            attn_factor, corr_dims, theta_scale, freq_factors);
+        rope_neox<forward, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
+            freq_factors, row_indices, set_rows_stride);
     } else {
-        rope_neox<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-            attn_factor, corr_dims, theta_scale, freq_factors);
+        rope_neox<forward, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale,
+            freq_factors, row_indices, set_rows_stride);
     }
 }
 
@@ -337,6 +403,7 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
     const ggml_tensor * src2 = dst->src[2];
+    const ggml_tensor * src3 = dst->src[3];
 
     const float * src0_d = (const float *)src0->data;
     const float * src1_d = (const float *)src1->data;
@@ -346,7 +413,9 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
+    // When not fused, src0 and dst types must match
+    // When fused (ROPE+VIEW+SET_ROWS), src0 may be F32 and dst may be F16
+    GGML_ASSERT(src0->type == dst->type || dst->type == GGML_TYPE_F16);
 
     const int64_t ne00 = src0->ne[0]; // head dims
     const int64_t ne01 = src0->ne[1]; // num heads
@@ -399,19 +468,32 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
         freq_factors = (const float *) src2->data;
     }
 
+    // Row indices for fused ROPE + VIEW + SET_ROWS
+    const int64_t * row_indices = nullptr;
+    int set_rows_stride = 0;
+    if (src3 != nullptr) {
+        GGML_ASSERT(src3->type == GGML_TYPE_I64);
+        row_indices     = (const int64_t *) src3->data;
+        set_rows_stride = ggml_get_op_params_i32(dst, 15);
+    }
+
     rope_corr_dims corr_dims;
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
 
     // compute
     if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda<forward>(
-                (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
-                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda<forward>(
-                (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
-                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            rope_neox_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims,
+                                                  nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                                                  freq_factors, row_indices, set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+            rope_neox_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims,
+                                                 nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                                                 freq_factors, row_indices, set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+            rope_neox_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr,
+                                                pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                                                freq_factors, row_indices, set_rows_stride, stream);
         } else {
             GGML_ABORT("fatal error");
         }
@@ -440,14 +522,18 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
             GGML_ABORT("fatal error");
         }
     } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_norm_cuda<forward>(
-                (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
-                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_norm_cuda<forward>(
-                (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
-                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            rope_norm_cuda<forward, float, float>((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims,
+                                                  nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                                                  freq_factors, row_indices, set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+            rope_norm_cuda<forward, float, half>((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims,
+                                                 nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                                                 freq_factors, row_indices, set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+            rope_norm_cuda<forward, half, half>((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr,
+                                                pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
+                                                freq_factors, row_indices, set_rows_stride, stream);
         } else {
             GGML_ABORT("fatal error");
         }
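Note (illustration, not part of the commit): when set_rows_stride != 0, both kernels replace the usual destination index with row_x * ne0 + i0 (i0 / 2 for neox) plus row_indices[channel_x] * set_rows_stride. That is exactly the element offset the unfused chain would produce: the VIEW flattens the (ne0, ne1) axes into one row of ne0*ne1 elements per token, and SET_ROWS scatters that row to slot row_indices[token] of the destination. A host-side sketch with hypothetical sizes that checks the equivalence:

    // Illustration only: compare the unfused ROPE -> VIEW -> SET_ROWS element offset
    // with the idst computed inside rope_norm when set_rows_stride != 0.
    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t ne0             = 128;              // head dim (rope->ne[0], hypothetical)
        const int64_t ne1             = 8;                // number of heads (rope->ne[1], hypothetical)
        const int64_t row_indices[]   = { 42, 7, 1000 };  // hypothetical KV-cache slots, one per token
        const int64_t n_tokens        = 3;
        const int64_t set_rows_stride = ne0 * ne1;        // dst->nb[1] / ggml_type_size(dst->type), packed rows

        for (int64_t channel_x = 0; channel_x < n_tokens; ++channel_x) {   // token
            for (int64_t row_x = 0; row_x < ne1; ++row_x) {                // head
                for (int64_t i0 = 0; i0 < ne0; ++i0) {                     // element within the head
                    // Unfused: offset in the temporary rope output ...
                    const int64_t rope_offset = (channel_x*ne1 + row_x)*ne0 + i0;
                    // ... which the VIEW reinterprets as one row of ne0*ne1 elements per token ...
                    const int64_t view_row = rope_offset / (ne0*ne1);      // == channel_x
                    const int64_t view_col = rope_offset % (ne0*ne1);      // == row_x*ne0 + i0
                    // ... and SET_ROWS scatters to row row_indices[view_row] of the destination.
                    const int64_t unfused = row_indices[view_row]*set_rows_stride + view_col;

                    // Fused: the index the kernel computes directly.
                    const int64_t fused = row_x*ne0 + i0 + row_indices[channel_x]*set_rows_stride;

                    assert(unfused == fused);
                }
            }
        }
        return 0;
    }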

src/llama-graph.cpp

Lines changed: 2 additions & 1 deletion
@@ -1592,14 +1592,15 @@ ggml_tensor * llm_graph_context::build_attn(
         int il) const {
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
+    // expand k later to enable rope fusion which directly writes into k-v cache
     ggml_build_forward_expand(gf, q_cur);
-    ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
     const auto * mctx_cur = inp->mctx;
 
     // store to KV cache
     {
+        ggml_build_forward_expand(gf, k_cur);
         const auto & k_idxs = inp->get_k_idxs();
         const auto & v_idxs = inp->get_v_idxs();
 
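Note (illustration, not part of the commit): this reordering matters because the CUDA fusion check only inspects consecutive graph nodes (node_idx, node_idx + 1, node_idx + 2). Expanding k_cur only when the KV-cache store is built keeps the ROPE that produces it adjacent to the VIEW and SET_ROWS that write it into the cache. A minimal sketch of the adjacency requirement, with hypothetical op sequences standing in for real graph contents:

    // Illustration only: the fusion fires only when ROPE, VIEW, SET_ROWS are consecutive nodes.
    #include <cstddef>
    #include <vector>

    enum Op { ROPE, VIEW, SET_ROWS, MUL_MAT, OTHER };

    static bool rope_set_rows_adjacent(const std::vector<Op> & nodes, std::size_t i) {
        return i + 2 < nodes.size() &&
               nodes[i] == ROPE && nodes[i + 1] == VIEW && nodes[i + 2] == SET_ROWS;
    }

    int main() {
        // Expanding k_cur early interleaves unrelated nodes between ROPE and SET_ROWS:
        const std::vector<Op> before = { ROPE, MUL_MAT, OTHER, VIEW, SET_ROWS };
        // Expanding k_cur just before the KV-cache store keeps the triple adjacent:
        const std::vector<Op> after  = { MUL_MAT, OTHER, ROPE, VIEW, SET_ROWS };

        bool fused_before = false;
        bool fused_after  = false;
        for (std::size_t i = 0; i < before.size(); ++i) { fused_before |= rope_set_rows_adjacent(before, i); }
        for (std::size_t i = 0; i < after.size();  ++i) { fused_after  |= rope_set_rows_adjacent(after,  i); }

        return (!fused_before && fused_after) ? 0 : 1;
    }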