Commit b9ce940
vulkan: Fuse rope+set_rows (#16769)
This pattern appears in a lot of models: the rope operation is applied right before storing into the KV cache (usually on the K tensor).

- Add a path to some of the rope shaders that computes the destination address based on the set_rows tensor.
- Compile variants of the shaders with a D_TYPE of f16 (the usual KV cache type).
- Add a src3 operand to ggml_vk_op_f32 - sometimes rope uses three srcs and needs the fourth for the row indices.
- Add fused_ops_write_mask to indicate which intermediate tensors need to write their results to memory. Skipping the write of the roped K value allows more nodes to run concurrently.
- Add logic to ggml_vk_graph_optimize to make ROPE+VIEW+SET_ROWS consecutive; it rarely starts out that way in the graph.
- Add new backend tests.
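For reference, the graph pattern this fusion targets looks roughly like the following. This is a minimal sketch against the public ggml API, modeled on the new test_rope_set_rows::build_graph added below; the shapes and the 4096-row cache size are illustrative, not taken from the commit.

#include "ggml.h"

// Minimal sketch of the ROPE + VIEW + SET_ROWS pattern being fused.
// Illustrative shapes: 128 = head dim, 32 = K heads, 512 = tokens,
// 4096 = KV cache rows (assumed).
static ggml_tensor * build_fused_pattern(ggml_context * ctx) {
    ggml_tensor * k     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 32, 512);
    ggml_tensor * pos   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
    ggml_tensor * roped = ggml_rope(ctx, k, pos, 128, GGML_ROPE_TYPE_NEOX);

    // view each token's roped heads as one flat row...
    ggml_tensor * view  = ggml_view_2d(ctx, roped, 128*32, 512, roped->nb[2], 0);

    // ...and scatter those rows to per-token slots in the f16 KV cache
    ggml_tensor * cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 128*32, 4096);
    ggml_tensor * ids   = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 512);
    return ggml_set_rows(ctx, cache, view, ids);
}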
1 parent 3464bda commit b9ce940

File tree

6 files changed: +371 -117 lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 248 additions & 86 deletions
Large diffs are not rendered by default.

ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer Y {int data_pos[];};
 layout (binding = 2) readonly buffer Z {float data_ff[];};
 layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
+layout (binding = 4) readonly buffer I {uvec2 data_i[];}; // indices for set_rows

 layout (push_constant) uniform parameter {
     uint ncols;
@@ -27,6 +28,7 @@ layout (push_constant) uniform parameter {
     uint s2;
     int sections[4];
     uint is_back;
+    uint set_rows_stride;
 } p;

 float rope_yarn_ramp(const float low, const float high, const uint i0) {

ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp

Lines changed: 10 additions & 3 deletions
@@ -16,12 +16,19 @@ void main() {
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;

-    const uint idst = row_dst*ne0 + i0/2;
+    uint idst = row_dst*ne0 + i0/2;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;

+    // Fusion optimization: ROPE + VIEW + SET_ROWS..
+    // The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
+    if (p.set_rows_stride != 0) {
+        idst = row_x*ne0 + i0/2;
+        idst += data_i[channel_x].x * p.set_rows_stride;
+    }
+
     if (i0 >= p.n_dims) {
-        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
-        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
+        data_d[idst + i0/2 + 0] = D_TYPE(data_a[ix + i0/2 + 0]);
+        data_d[idst + i0/2 + 1] = D_TYPE(data_a[ix + i0/2 + 1]);

         return;
     }
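When set_rows_stride is nonzero, the fused branch replaces the contiguous output index with one derived from the set_rows indices: the token index channel_x selects an entry of data_i, and that row id times set_rows_stride (the cache's row stride in elements) relocates the write into the KV cache. The index tensor is I64 in the tests, so the shader binds it as uvec2 and uses only the low word (.x). A scalar model of the addressing, reconstructed from the shader above rather than taken from the commit:

#include <cstdint>

// Scalar reconstruction of the fused destination addressing; my sketch,
// not code from the commit. elem stands for i0/2 (neox) or i0 (norm),
// ne0 is the head size, row_ids are the set_rows indices (low 32 bits
// used, matching data_i[channel_x].x in the shader).
static uint32_t fused_idst(uint32_t row_x, uint32_t channel_x, uint32_t elem,
                           uint32_t ne0, uint32_t set_rows_stride,
                           const uint64_t * row_ids) {
    // unfused: idst = (channel_x*ne1 + row_x)*ne0 + elem, a contiguous store
    uint32_t idst = row_x*ne0 + elem;                        // offset within the token
    idst += (uint32_t)row_ids[channel_x] * set_rows_stride;  // indexed cache row
    return idst;
}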

ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp

Lines changed: 10 additions & 3 deletions
@@ -16,12 +16,19 @@ void main() {
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;

-    const uint idst = row_dst*ne0 + i0;
+    uint idst = row_dst*ne0 + i0;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;

+    // Fusion optimization: ROPE + VIEW + SET_ROWS..
+    // The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
+    if (p.set_rows_stride != 0) {
+        idst = row_x*ne0 + i0;
+        idst += data_i[channel_x].x * p.set_rows_stride;
+    }
+
     if (i0 >= p.n_dims) {
-        data_d[idst + 0] = data_a[ix + 0];
-        data_d[idst + 1] = data_a[ix + 1];
+        data_d[idst + 0] = D_TYPE(data_a[ix + 0]);
+        data_d[idst + 1] = D_TYPE(data_a[ix + 1]);

         return;
     }

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 4 additions & 0 deletions
@@ -842,10 +842,14 @@ void process_shaders() {
     string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+    string_to_spv("rope_norm_f32_f16", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_norm_f32_f16_rte", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});

     string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+    string_to_spv("rope_neox_f32_f16", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_neox_f32_f16_rte", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});

     string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});

tests/test-backend-ops.cpp

Lines changed: 97 additions & 25 deletions
@@ -2125,6 +2125,34 @@ struct test_get_rows_back : public test_case {
     }
 };

+static void init_set_rows_row_ids(ggml_tensor * t, int num_rows) {
+    std::random_device rd;
+    std::default_random_engine rng(rd());
+    for (int i2 = 0; i2 < t->ne[2]; i2++) {
+        for (int i1 = 0; i1 < t->ne[1]; i1++) {
+            // generate a shuffled subset of row indices
+            std::vector<int64_t> data(num_rows);
+            for (int i = 0; i < num_rows; i++) {
+                data[i] = i;
+            }
+            std::shuffle(data.begin(), data.end(), rng);
+            data.resize(t->ne[0]);
+
+            const size_t offs = i1*t->nb[1] + i2*t->nb[2];
+            if (t->type == GGML_TYPE_I32) {
+                // TODO: Make a template or something
+                std::vector<int32_t> data_i32(t->ne[0]);
+                for (int i = 0; i < t->ne[0]; i++) {
+                    data_i32[i] = static_cast<int32_t>(data[i]);
+                }
+                ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t));
+            } else {
+                ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t));
+            }
+        }
+    }
+}
+
 // GGML_OP_SET_ROWS
 struct test_set_rows : public test_case {
     const ggml_type type;
@@ -2168,37 +2196,13 @@ struct test_set_rows : public test_case {
     }

     void initialize_tensors(ggml_context * ctx) override {
-        std::random_device rd;
-        std::default_random_engine rng(rd());
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
                 if (ggml_is_view_op(t->op)) {
                     continue;
                 }

-                for (int i2 = 0; i2 < t->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < t->ne[1]; i1++) {
-                        // generate a shuffled subset of row indices
-                        std::vector<int64_t> data(ne[1]);
-                        for (int i = 0; i < ne[1]; i++) {
-                            data[i] = i;
-                        }
-                        std::shuffle(data.begin(), data.end(), rng);
-                        data.resize(t->ne[0]);
-
-                        const size_t offs = i1*t->nb[1] + i2*t->nb[2];
-                        if (t->type == GGML_TYPE_I32) {
-                            // TODO: Make a template or something
-                            std::vector<int32_t> data_i32(t->ne[0]);
-                            for (int i = 0; i < t->ne[0]; i++) {
-                                data_i32[i] = static_cast<int32_t>(data[i]);
-                            }
-                            ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t));
-                        } else {
-                            ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t));
-                        }
-                    }
-                }
+                init_set_rows_row_ids(t, ne[1]);
             } else {
                 init_tensor_uniform(t);
             }
@@ -2227,6 +2231,67 @@ struct test_set_rows : public test_case {
     }
 };

+// GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS
+struct test_rope_set_rows : public test_case {
+    const ggml_type type;
+    const ggml_type type_idx;
+    const std::array<int64_t, 4> ne;
+    int mode;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, type_idx, ne, mode);
+    }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "ROPE_SET_ROWS";
+    }
+
+    bool run_whole_graph() override { return true; }
+
+    test_rope_set_rows(ggml_type type,
+            ggml_type type_idx,
+            std::array<int64_t, 4> ne,
+            int mode)
+        : type(type), type_idx(type_idx), ne(ne), mode(mode) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1);
+        ggml_set_name(src, "src");
+
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+
+        ggml_tensor * rope = ggml_rope(ctx, src, pos, ne[0], mode);
+
+        ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0);
+
+        ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0] * ne[1], ne[2] * ne[3], 1, 1);
+        ggml_set_name(dst, "dst");
+
+        ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne[2], 1, 1);
+        ggml_set_name(row_idxs, "row_idxs");
+
+        ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
+                if (ggml_is_view_op(t->op)) {
+                    continue;
+                }
+
+                init_set_rows_row_ids(t, ne[2]);
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
 // GGML_OP_ARGMAX
 struct test_argmax : public test_case {
     const ggml_type type;
@@ -6163,6 +6228,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }

+    for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX }) {
+        for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 1, 100 }, mode));
+            test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 512, 1 }, mode));
+        }
+    }
+
     for (ggml_type type_input : {GGML_TYPE_F32}) {
         for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
             for (int k0 : {1, 3}) {
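As a reading aid, here is a scalar model of what the fused tests verify for GGML_ROPE_TYPE_NORMAL. It is a simplification, not code from the commit: plain theta = pos * base^(-i/n_dim) rotation with base 10000, no YaRN or frequency-factor handling, and one flat row per token.

#include <cmath>
#include <cstdint>

// Rotate each (even, odd) pair of a token's K row, then scatter the
// rotated row to cache row row_ids[t] (the set_rows step).
void rope_set_rows_ref(const float * k,         // [n_tokens][n_dim]
                       const int32_t * pos,     // [n_tokens]
                       const int64_t * row_ids, // [n_tokens]
                       float * cache,           // [n_cache_rows][n_dim]
                       int n_tokens, int n_dim) {
    for (int t = 0; t < n_tokens; t++) {
        float * dst = cache + row_ids[t]*n_dim; // indexed, not contiguous
        for (int i = 0; i < n_dim; i += 2) {
            const float theta = pos[t] * std::pow(10000.0f, -(float)i/n_dim);
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            const float x0 = k[t*n_dim + i + 0];
            const float x1 = k[t*n_dim + i + 1];
            dst[i + 0] = x0*c - x1*s;
            dst[i + 1] = x0*s + x1*c;
        }
    }
}

Assuming the usual test-backend-ops CLI, the new cases should be runnable in isolation with something like "test-backend-ops test -o ROPE_SET_ROWS", matching the op_desc string above.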
