ggml-org
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
Lines changed: 22 additions & 26 deletions
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
Lines changed: 0 additions & 44 deletions
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl
Lines changed: 0 additions & 41 deletions
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl
Lines changed: 124 additions & 0 deletions
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
Lines changed: 7 additions & 4 deletions
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl
Lines changed: 0 additions & 44 deletions
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl
Lines changed: 0 additions & 41 deletions
@@ -696,23 +696,19 @@ static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_t
 
     std::vector<uint32_t> params = {
         (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
+        (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
+        (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
+        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+        (uint32_t) src->ne[0],
+        (uint32_t) src->ne[1],
+        (uint32_t) src->ne[2],
+        (uint32_t) src->ne[3],
+        *(uint32_t *) dst->op_params  // epsilon, treated as f32 in the shader
     };
-    if (!in_place) {
-        params.push_back((uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)));
-    }
-    params.push_back((uint32_t) (src->nb[1] / ggml_type_size(src->type)));
-    params.push_back((uint32_t) (src->nb[2] / ggml_type_size(src->type)));
-    params.push_back((uint32_t) (src->nb[3] / ggml_type_size(src->type)));
-    if (!in_place) {
-        params.push_back((uint32_t) (dst->nb[1] / ggml_type_size(dst->type)));
-        params.push_back((uint32_t) (dst->nb[2] / ggml_type_size(dst->type)));
-        params.push_back((uint32_t) (dst->nb[3] / ggml_type_size(dst->type)));
-    }
-    params.push_back((uint32_t) src->ne[0]);
-    params.push_back((uint32_t) src->ne[1]);
-    params.push_back((uint32_t) src->ne[2]);
-    params.push_back((uint32_t) src->ne[3]);
-    params.push_back(*(uint32_t *) dst->op_params);  // epsilon, treated as f32 in the shader
 
     std::vector<wgpu::BindGroupEntry> entries = {
         { .binding = 0,
@@ -1266,10 +1262,10 @@ static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
                                 constants);
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16], wgsl_add_f16, "add_f16",
                                 constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F32], wgsl_add_in_place_f32,
-                                "add_in_place_f32", constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F16], wgsl_add_in_place_f16,
-                                "add_in_place_f16", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F32], wgsl_add_f32_inplace,
+                                "add_f32_inplace", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F16], wgsl_add_f16_inplace,
+                                "add_f16_inplace", constants);
 }
 
 static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
@@ -1278,18 +1274,18 @@ static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
                                 constants);
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16], wgsl_mul_f16, "mul_f16",
                                 constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F32], wgsl_mul_in_place_f32,
-                                "mul_in_place_f32", constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F16], wgsl_mul_in_place_f16,
-                                "mul_in_place_f16", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F32], wgsl_mul_f32_inplace,
+                                "mul_f32_inplace", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F16], wgsl_mul_f16_inplace,
+                                "mul_f16_inplace", constants);
 }
 
 static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {  // creates the rms_norm and rms_norm_ip pipelines stored on webgpu_ctx
     std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);  // the same constant entries are passed to both pipelines below
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline, wgsl_rms_norm, "rms_norm",
                                 constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_ip_pipeline, wgsl_rms_norm_in_place,
-                                "rms_norm_in_place", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_ip_pipeline, wgsl_rms_norm_inplace,
+                                "rms_norm_inplace", constants);  // in-place variant: shader symbol and label renamed from *_in_place to *_inplace
 }
 
 static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
 
@@ -0,0 +1,124 @@
+#define(VARIANTS)
+
+[
+  {
+    "SHADER_NAME": "add_f32",
+    "REPLS": {
+      "TYPE" : "f32",
+      "OP": "+"
+    },
+    "DECLS": ["NOT_INPLACE"]
+  },
+  {
+    "SHADER_NAME": "add_f16",
+    "REPLS": {
+      "TYPE" : "f16",
+      "OP": "+"
+    },
+    "DECLS": ["NOT_INPLACE"]
+  },
+  {
+    "SHADER_NAME": "add_f32_inplace",
+    "REPLS": {
+      "TYPE" : "f32",
+      "OP": "+"
+    },
+    "DECLS": ["INPLACE"]
+  },
+  {
+    "SHADER_NAME": "add_f16_inplace",
+    "REPLS": {
+      "TYPE" : "f16",
+      "OP": "+"
+    },
+    "DECLS": ["INPLACE"]
+  },
+  {
+    "SHADER_NAME": "mul_f32",
+    "REPLS": {
+      "TYPE" : "f32",
+      "OP": "*"
+    },
+    "DECLS": ["NOT_INPLACE"]
+  },
+  {
+    "SHADER_NAME": "mul_f16",
+    "REPLS": {
+      "TYPE" : "f16",
+      "OP": "*"
+    },
+    "DECLS": ["NOT_INPLACE"]
+  },
+  {
+    "SHADER_NAME": "mul_f32_inplace",
+    "REPLS": {
+      "TYPE" : "f32",
+      "OP": "*"
+    },
+    "DECLS": ["INPLACE"]
+  },
+  {
+    "SHADER_NAME": "mul_f16_inplace",
+    "REPLS": {
+      "TYPE" : "f16",
+      "OP": "*"
+    },
+    "DECLS": ["INPLACE"]
+  }
+]
+
+#end(VARIANTS)
+
+#define(DECLS)
+
+#decl(NOT_INPLACE)
+
+fn update(dst_i: u32, src0_i: u32, src1_i: u32) {  // out-of-place variant: the result goes to the separate dst buffer
+    dst[dst_i] = src0[src0_i] {{OP}} src1[src1_i];  // the operator comes from the variant's "OP" replacement ("+" or "*")
+}
+
+@group(0) @binding(2)
+var<storage, read_write> dst: array<{{TYPE}}>;  // element type comes from the variant's "TYPE" replacement (f32 or f16)
+
+@group(0) @binding(3)
+var<uniform> params: Params;  // Params is not declared in this template; presumably provided by the included binary_head.tmpl (confirm)
+
+#enddecl(NOT_INPLACE)
+
+#decl(INPLACE)
+
+fn update(dst_i: u32, src0_i: u32, src1_i: u32) {  // in-place variant: same signature, but the result overwrites src0
+    src0[dst_i] = src0[src0_i] {{OP}} src1[src1_i];  // dst_i indexes src0 here because this variant binds no dst buffer
+}
+
+@group(0) @binding(2)
+var<uniform> params: Params;  // binding 2 instead of 3, since this variant declares no dst binding
+
+#enddecl(INPLACE)
+
+#end(DECLS)
+
+
+#define(SHADER)
+
+enable f16;  // enabled for every variant, including the f32 ones
+
+#include "binary_head.tmpl"
+
+@group(0) @binding(0)
+var<storage, read_write> src0: array<{{TYPE}}>;  // first operand; also the output buffer in the in-place variants
+
+@group(0) @binding(1)
+var<storage, read_write> src1: array<{{TYPE}}>;  // second operand; only read by update()
+
+DECLS
+
+override wg_size: u32;  // pipeline-overridable workgroup size; presumably set from the host-side constants (confirm)
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {  // one invocation per element, indexed by gid.x
+    if (gid.x < params.ne) {  // invocations at or beyond params.ne do nothing
+        update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));  // src1_index is not defined here; presumably from binary_head.tmpl (confirm)
+    }
+}
+
+#end(SHADER)
@@ -89,14 +89,17 @@ def generate_variants(fname, input_dir, output_dir, outfile):
                 decls_code += decls_map[key] + "\n\n"
 
             final_shader = re.sub(r'\bDECLS\b', decls_code, shader_template)
-            final_shader = replace_placeholders(final_shader, variant["REPLS"])
+            if "REPLS" in variant:
+                final_shader = replace_placeholders(final_shader, variant["REPLS"])
             final_shader = expand_includes(final_shader, input_dir)
 
-            if "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
-                output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
+            if "SHADER_NAME" in variant:
+                output_name = variant["SHADER_NAME"]
             elif "SHADER_SUFFIX" in variant:
                 output_name = f"{shader_base_name}_" + variant["SHADER_SUFFIX"]
-            elif "TYPE" in variant["REPLS"]:
+            elif "REPLS" in variant and "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
+                output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
+            elif "REPLS" in variant and "TYPE" in variant["REPLS"]:
                 output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE"]
             else:
                 output_name = shader_base_name