Work on rope variants

reeselevine · reeselevine · commit 89f6cefe9c17 · 2025-09-18T15:42:55.000-07:00
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -137,7 +137,7 @@ struct webgpu_context_struct {
     wgpu::ComputePipeline mul_ip_pipeline[2];
     wgpu::ComputePipeline rms_norm_pipeline;
     wgpu::ComputePipeline rms_norm_ip_pipeline;
-    wgpu::ComputePipeline rope_pipeline[2][2];
+    wgpu::ComputePipeline rope_pipeline[2][2][2][2];  // indexed [type][is_neox][has_freq_factor][inplace]
 
     size_t memset_bytes_per_thread;
 
@@ -734,11 +734,17 @@ static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_t
     ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 
-static void ggml_webgpu_rope(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * src2, ggml_tensor * dst) {
-    bool in_place = ggml_webgpu_tensor_equal(src0, dst);
-    int has_freq_factor = (src2 != nullptr);
-
-    float  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+static void ggml_webgpu_rope(webgpu_context & ctx,
+                             ggml_tensor *    src0,
+                             ggml_tensor *    src1,
+                             ggml_tensor *    src2,
+                             ggml_tensor *    dst) {
+    const int inplace         = ggml_webgpu_tensor_equal(src0, dst);  // picks the in-place pipeline; dst is then not bound separately
+    const int has_freq_factor = (src2 != nullptr);                    // src2 is optional; when present it is bound for the *_ff shaders
+    const int mode            = ((int32_t *) dst->op_params)[2];
+    const int is_neox         = (mode & GGML_ROPE_TYPE_NEOX) != 0;    // collapse the flag bit to 0/1: it indexes a [2]-sized dimension
+
+    float     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     const int n_dims     = ((int32_t *) dst->op_params)[1];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
@@ -757,30 +763,25 @@ static void ggml_webgpu_rope(webgpu_context & ctx, ggml_tensor * src0, ggml_tens
     std::vector<uint32_t> params = {
         (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
         (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
+        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
+        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
+        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+        (uint32_t) ggml_nelements(src0) / 2,
+        (uint32_t) src0->ne[0],
+        (uint32_t) src0->ne[1],
+        (uint32_t) src0->ne[2],
+        (uint32_t) n_dims,
+        *(uint32_t *) &theta_scale,
+        *(uint32_t *) &attn_factor,
+        *(uint32_t *) &freq_scale,
+        *(uint32_t *) &ext_factor,
+        *(uint32_t *) &corr_dims[0],
+        *(uint32_t *) &corr_dims[1]
     };
-    if (!in_place) {
-        params.push_back((uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)));
-    }
-    params.push_back((uint32_t) (src0->nb[1] / ggml_type_size(src0->type)));
-    params.push_back((uint32_t) (src0->nb[2] / ggml_type_size(src0->type)));
-    params.push_back((uint32_t) (src0->nb[3] / ggml_type_size(src0->type)));
-    if (!in_place) {
-        params.push_back((uint32_t) (dst->nb[1] / ggml_type_size(dst->type)));
-        params.push_back((uint32_t) (dst->nb[2] / ggml_type_size(dst->type)));
-        params.push_back((uint32_t) (dst->nb[3] / ggml_type_size(dst->type)));
-    }
-    params.push_back((uint32_t) ggml_nelements(src0) / 2);
-    params.push_back((uint32_t) src0->ne[0]);
-    params.push_back((uint32_t) src0->ne[1]);
-    params.push_back((uint32_t) src0->ne[2]);
-
-    params.push_back((uint32_t) n_dims);
-    params.push_back(*(uint32_t *) &theta_scale);
-    params.push_back(*(uint32_t *) &attn_factor);
-    params.push_back(*(uint32_t *) &freq_scale);
-    params.push_back(*(uint32_t *) &ext_factor);
-    params.push_back(*(uint32_t *) &corr_dims[0]);
-    params.push_back(*(uint32_t *) &corr_dims[1]);
 
     std::vector<wgpu::BindGroupEntry> entries = {
         { .binding = 0,
@@ -800,21 +801,16 @@ static void ggml_webgpu_rope(webgpu_context & ctx, ggml_tensor * src0, ggml_tens
                             .offset  = ggml_webgpu_tensor_align_offset(ctx, src2),
                             .size    = ggml_webgpu_tensor_binding_size(ctx, src2) });
     }
-    if (!in_place) {
+    if (!inplace) {
         entries.push_back({ .binding = dst_binding,
                             .buffer  = ggml_webgpu_tensor_buf(dst),
                             .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
                             .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
     }
 
-    wgpu::ComputePipeline pipeline;
-    if (in_place) {
-        pipeline = ctx->rope_pipeline[dst->type][has_freq_factor];
-    } else {
-        pipeline = ctx->rope_pipeline[dst->type][has_freq_factor];
-    }
-    size_t   max_wg_size = ctx->max_wg_size_x;
-    uint32_t wg_x        = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size;
+    wgpu::ComputePipeline pipeline    = ctx->rope_pipeline[dst->type][is_neox][has_freq_factor][inplace];
+    size_t                max_wg_size = ctx->max_wg_size_x;
+    uint32_t              wg_x        = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size;
     ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 
@@ -1290,10 +1286,22 @@ static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
 
 static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0], wgsl_rope_f32_norm, "rope_f32_norm", constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1], wgsl_rope_f32_norm_ff, "rope_f32_norm_ff", constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0], wgsl_rope_f16_norm, "rope_f16_norm", constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1], wgsl_rope_f16_norm_ff, "rope_f16_norm_ff", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][0][0],
+                                wgsl_rope_f32_norm, "rope_f32_norm", constants);  // slot order: [type][mode][ff][inplace]
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][0][1],
+                                wgsl_rope_f32_norm_inplace, "rope_f32_norm_inplace", constants);  // last index 1: in-place shader
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][1][0],
+                                wgsl_rope_f32_norm_ff, "rope_f32_norm_ff", constants);  // third index 1: shader that reads freq factors
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][1][1],
+                                wgsl_rope_f32_norm_ff_inplace, "rope_f32_norm_ff_inplace", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][0][0],
+                                wgsl_rope_f16_norm, "rope_f16_norm", constants);  // same four variants again for F16
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][0][1],
+                                wgsl_rope_f16_norm_inplace, "rope_f16_norm_inplace", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][1][0],
+                                wgsl_rope_f16_norm_ff, "rope_f16_norm_ff", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][1][1],
+                                wgsl_rope_f16_norm_ff_inplace, "rope_f16_norm_ff_inplace", constants);  // NOTE(review): every call uses mode index 0; nothing is created for mode index 1 - confirm dispatch never selects it
 }
 
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl
@@ -6,54 +6,136 @@
     "REPLS": {
       "TYPE" : "f32",
     },
-    "DECLS": ["NO_FREQ_FAC"]
+    "DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "NORM", "ROTATE"]
+  },
+  {
+    "SHADER_SUFFIX": "f32_norm_inplace",
+    "REPLS": {
+      "TYPE" : "f32",
+    },
+    "DECLS": ["NO_FF_BINDINGS_INPLACE", "NO_FF_FUNC", "NORM", "ROTATE_INPLACE"]
   },
   {
     "SHADER_SUFFIX": "f16_norm",
     "REPLS": {
       "TYPE" : "f16",
     },
-    "DECLS": ["NO_FREQ_FAC"]
+    "DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "NORM", "ROTATE"]
+  },
+  {
+    "SHADER_SUFFIX": "f16_norm_inplace",
+    "REPLS": {
+      "TYPE" : "f16",
+    },
+    "DECLS": ["NO_FF_BINDINGS_INPLACE", "NO_FF_FUNC", "NORM", "ROTATE_INPLACE"]
   },
   {
    "SHADER_SUFFIX": "f32_norm_ff",
     "REPLS": {
       "TYPE" : "f32",
     },
-    "DECLS": ["FREQ_FAC"]
+    "DECLS": ["FF_BINDINGS", "FF_FUNC", "NORM", "ROTATE"]
+  },
+  {
+   "SHADER_SUFFIX": "f32_norm_ff_inplace",
+    "REPLS": {
+      "TYPE" : "f32",
+    },
+    "DECLS": ["FF_BINDINGS_INPLACE", "FF_FUNC", "NORM", "ROTATE_INPLACE"]
   },
   {
     "SHADER_SUFFIX": "f16_norm_ff",
     "REPLS": {
       "TYPE" : "f16",
     },
-    "DECLS": ["FREQ_FAC"]
+    "DECLS": ["FF_BINDINGS", "FF_FUNC", "NORM", "ROTATE"]
+  },
+  {
+    "SHADER_SUFFIX": "f16_norm_ff_inplace",
+    "REPLS": {
+      "TYPE" : "f16",
+    },
+    "DECLS": ["FF_BINDINGS_INPLACE", "FF_FUNC", "NORM", "ROTATE_INPLACE"]
+  },
+
+  {
+    "SHADER_SUFFIX": "f32_neox",
+    "REPLS": {
+      "TYPE" : "f32",
+    },
+    "DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "NEOX", "ROTATE"]
+  },
+  {
+    "SHADER_SUFFIX": "f16_neox",
+    "REPLS": {
+      "TYPE" : "f16",
+    },
+    "DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "NEOX", "ROTATE"]
+  },
+  {
+   "SHADER_SUFFIX": "f32_neox_ff",
+    "REPLS": {
+      "TYPE" : "f32",
+    },
+    "DECLS": ["FF_BINDINGS", "FF_FUNC", "NEOX", "ROTATE"]
+  },
+  {
+    "SHADER_SUFFIX": "f16_neox_ff",
+    "REPLS": {
+      "TYPE" : "f16",
+    },
+    "DECLS": ["FF_BINDINGS", "FF_FUNC", "NEOX", "ROTATE"]
   }
 ]
 
 #end(VARIANTS)
 
 #define(DECLS)
 
-#decl(NO_FREQ_FAC)
+#decl(ROTATE)
+fn rotate(i_dst: u32, out0: f32, out1: f32) {
+    dst[i_dst] = {{TYPE}}(out0);
+    dst[i_dst + pair_offset()] = {{TYPE}}(out1);
+}
+#enddecl(ROTATE)
 
+#decl(ROTATE_INPLACE)
+fn rotate(i_dst: u32, out0: f32, out1: f32) {
+    src0[i_dst] = {{TYPE}}(out0);
+    src0[i_dst + pair_offset()] = {{TYPE}}(out1);
+}
+#enddecl(ROTATE_INPLACE)
+
+#decl(NO_FF_FUNC)
 fn freq_factor(i: u32) -> f32 {
     return 1.0f;
 }
+#enddecl(NO_FF_FUNC)
+
+#decl(FF_FUNC)
+fn freq_factor(i: u32) -> f32 {
+    return src2[i/2];
+}
+#enddecl(FF_FUNC)
+
+#decl(NO_FF_BINDINGS)
 
 @group(0) @binding(2)
 var<storage, read_write> dst: array<{{TYPE}}>;
 
 @group(0) @binding(3)
 var<uniform> params: Params;
 
-#enddecl(NO_FREQ_FAC)
+#enddecl(NO_FF_BINDINGS)
 
-#decl(FREQ_FAC)
+#decl(NO_FF_BINDINGS_INPLACE)
 
-fn freq_factor(i: u32) -> f32 {
-    return src2[i/2];
-}
+@group(0) @binding(2)
+var<uniform> params: Params;
+
+#enddecl(NO_FF_BINDINGS_INPLACE)
+
+#decl(FF_BINDINGS)
 
 @group(0) @binding(2)
 var<storage, read_write> src2: array<f32>;
@@ -64,7 +146,29 @@ var<storage, read_write> dst: array<{{TYPE}}>;
 @group(0) @binding(4)
 var<uniform> params: Params;
 
-#enddecl(FREQ_FAC)
+#enddecl(FF_BINDINGS)
+
+#decl(FF_BINDINGS_INPLACE)
+
+@group(0) @binding(2)
+var<storage, read_write> src2: array<f32>;
+
+@group(0) @binding(3)
+var<uniform> params: Params;
+
+#enddecl(FF_BINDINGS_INPLACE)
+
+#decl(NORM)
+fn pair_offset() -> u32 {
+    return 1;
+}
+#enddecl(NORM)
+
+#decl(NEOX)
+fn pair_offset() -> u32 {
+    return params.n_dims / 2;
+}
+#enddecl(NEOX)
 
 #end(DECLS)
 
@@ -146,18 +250,16 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let i_dst = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1 + i0;
 
     if (i0 >= params.n_dims) {
-        dst[i_dst] = src0[i_src];
-        dst[i_dst + 1] = src0[i_src + 1];
+        rotate(i_dst, f32(src0[i_src]), f32(src0[i_src + 1]));
         return;
     }
 
     let theta_base = f32(src1[params.offset_src1 + i2]) * pow(params.theta_scale, f32(i0)/2.0f);
     let thetas = rope_yarn(theta_base/freq_factor(i0), i0);
 
     let x0 = f32(src0[i_src]);
-    let x1 = f32(src0[i_src + 1]);
-    dst[i_dst] = {{TYPE}}(x0 * thetas.x - x1 * thetas.y);
-    dst[i_dst + 1] = {{TYPE}}(x0 * thetas.y + x1 * thetas.x);
+    let x1 = f32(src0[i_src + pair_offset()]);
+    rotate(i_dst, x0 * thetas.x - x1 * thetas.y, x0 * thetas.y + x1 * thetas.x);
 }
 
 #end(SHADER)
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp