Vectorize f32 and change default workgroup size

reeselevine · reeselevine · commit 456178444339 · 2025-09-15T12:57:27.000-07:00
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -129,6 +129,7 @@ struct webgpu_context_struct {
     wgpu::ComputePipeline mul_mat_pipeline[30][2];
     wgpu::ComputePipeline set_rows_pipeline;
     wgpu::ComputePipeline get_rows_pipeline[30];
+    wgpu::ComputePipeline get_rows_f32_no_vec_pipeline;
     wgpu::ComputePipeline cpy_pipeline;
     wgpu::ComputePipeline add_pipeline[2];
     wgpu::ComputePipeline add_ip_pipeline[2];
@@ -595,8 +596,11 @@ static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
     size_t   max_wg_size = ctx->max_wg_size_x;
     uint32_t wg_x        = (dst->ne[1] * dst->ne[2] * dst->ne[3] + max_wg_size - 1) / max_wg_size;
 
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->get_rows_pipeline[src->type], params, entries, wg_x,
-                                          ggml_op_name(dst->op));
+    wgpu::ComputePipeline pipeline = ctx->get_rows_pipeline[src->type];  // F32 slot holds the vec4 variant
+    if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) {  // vec4 shader only moves whole groups of 4 floats
+        pipeline = ctx->get_rows_f32_no_vec_pipeline;  // scalar fallback for row lengths not divisible by 4
+    }
+    ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 
 static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
@@ -1117,7 +1121,9 @@ static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
 
 static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F32], wgsl_get_rows_f32,
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F32], wgsl_get_rows_f32_vec,
+                                "get_rows_f32_vec", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_f32_no_vec_pipeline, wgsl_get_rows_f32,
                                 "get_rows_f32", constants);
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F16], wgsl_get_rows_f16,
                                 "get_rows_f16", constants);
@@ -1423,7 +1429,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     GGML_ASSERT(ctx->adapter != nullptr);
 
     ctx->adapter.GetLimits(&ctx->limits);
-    ctx->max_wg_size_x = 256;  // default value
+    ctx->max_wg_size_x = 288;  // default value
 
     wgpu::AdapterInfo info{};
     ctx->adapter.GetInfo(&info);
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
@@ -92,6 +92,8 @@ def generate_variants(fname, input_dir, output_dir, outfile):
 
             if "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
                 output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
+            elif "TYPE_SUFFIX" in variant["REPLS"]:
+                output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE_SUFFIX"]
             elif "TYPE" in variant["REPLS"]:
                 output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE"]
             else:
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl
@@ -1,156 +1,187 @@
 #define(VARIANTS)
 
 [
+  {
+    "REPLS": {
+      "TYPE" : "vec4<f32>",
+      "TYPE_SUFFIX": "f32_vec",
+      "DST_TYPE": "vec4<f32>",
+      "BLOCK_SIZE": 4
+    },
+    "DECLS": ["F32_VEC"]
+  },
   {
     "REPLS": {
       "TYPE" : "f32",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 1
     },
-    "DECLS": ["FLOAT"]
+    "DECLS": ["F32"]
   },
   {
     "REPLS": {
       "TYPE" : "f16",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 1
     },
-    "DECLS": ["FLOAT"]
+    "DECLS": ["F16"]
   },
   {
     "REPLS": {
       "TYPE" : "i32",
+      "DST_TYPE": "i32",
       "BLOCK_SIZE": 1
     },
-    "DECLS": ["FLOAT"]
+    "DECLS": ["I32"]
   },
   {
     "REPLS": {
       "TYPE" : "q4_0",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 32
     },
     "DECLS": ["BYTE_HELPERS", "Q4_0_T", "Q4_0"]
   },
   {
     "REPLS": {
       "TYPE" : "q4_1",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 32
     },
     "DECLS": ["BYTE_HELPERS", "Q4_1_T", "Q4_1"]
   },
   {
     "REPLS": {
       "TYPE" : "q5_0",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 32
     },
     "DECLS": ["BYTE_HELPERS", "Q5_0_T", "Q5_0"]
   },
   {
     "REPLS": {
       "TYPE" : "q5_1",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 32
     },
     "DECLS": ["BYTE_HELPERS", "Q5_1_T", "Q5_1"]
   },
   {
     "REPLS": {
       "TYPE" : "q8_0",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 32
     },
     "DECLS": ["BYTE_HELPERS", "Q8_0_T", "Q8_0"]
   },
   {
     "REPLS": {
       "TYPE" : "q2_k",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "Q2_K_T", "Q2_K"]
   },
   {
     "REPLS": {
       "TYPE" : "q3_k",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "Q3_K_T", "Q3_K"]
   },
   {
     "REPLS": {
       "TYPE" : "q4_k",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q4_K_T", "Q4_K"]
   },
   {
     "REPLS": {
       "TYPE" : "q5_k",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q5_K_T", "Q5_K"]
   },
   {
     "REPLS": {
       "TYPE" : "q6_k",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "Q6_K_T", "Q6_K"]
   },
   {
     "REPLS": {
       "TYPE" : "iq2_xxs",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XXS_GRID", "IQ2_XXS_T", "IQ2_XXS"]
   },
   {
     "REPLS": {
       "TYPE" : "iq2_xs",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XS_GRID", "IQ2_XS_T", "IQ2_XS"]
   },
   {
     "REPLS": {
       "TYPE": "iq2_s",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_S_GRID", "IQ2_S_T", "IQ2_S"]
   },
   {
     "REPLS": {
       "TYPE": "iq3_xxs",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_XSS_GRID", "IQ3_XSS_T", "IQ3_XSS"]
   },
   {
     "REPLS": {
       "TYPE": "iq3_s",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_S_GRID", "IQ3_S_T", "IQ3_S"]
   },
   {
     "REPLS": {
       "TYPE": "iq1_s",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_S_T", "IQ1_S"]
   },
   {
     "REPLS": {
       "TYPE": "iq1_m",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256
     },
     "DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_M_T", "IQ1_M"]
   },
   {
     "REPLS": {
       "TYPE": "iq4_nl",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 32,
     },
     "DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_NL_T", "IQ4_NL"]
   },
   {
     "REPLS": {
       "TYPE": "iq4_xs",
+      "DST_TYPE": "f32",
       "BLOCK_SIZE": 256,
     },
     "DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_XS_T", "IQ4_XS"]
@@ -161,11 +192,29 @@
 
 #define(DECLS)
 
-#decl(FLOAT)
+#decl(F32_VEC)
+fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {  // F32_VEC variant: src and dst are bound as array<vec4<f32>>
+    dst[(dst_base / 4) + offset] = src[(src_base / 4) + offset];  // one vec4 copied per call; bases are divided by 4 to index vec4 slots (presumably scalar-element offsets that must be multiples of 4 - confirm against main)
+}
+#enddecl(F32_VEC)
+
+#decl(F32)
+fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
+    dst[dst_base + offset] = src[src_base + offset];
+}
+#enddecl(F32)
+
+#decl(F16)
 fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
     dst[dst_base + offset] = f32(src[src_base + offset]);
 }
-#enddecl(FLOAT)
+#enddecl(F16)
+
+#decl(I32)
+fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
+    dst[dst_base + offset] = src[src_base + offset];
+}
+#enddecl(I32)
 
 #decl(Q4_0)
 fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
@@ -759,7 +808,7 @@ var<storage, read_write> src: array<{{TYPE}}>;
 var<storage, read_write> idx: array<i32>;
 
 @group(0) @binding(2)
-var<storage, read_write> dst: array<f32>;
+var<storage, read_write> dst: array<{{DST_TYPE}}>;
 
 struct Params {
     offset_src: u32, // in elements
@@ -822,4 +871,4 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     }
 }
 
-#end(SHADER)
+#end(SHADER)