Cleanup code

reeselevine · reeselevine · commit 51252f024391 · 2025-08-19T15:35:45.000-07:00
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -50,13 +50,6 @@ static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) {
 
 /* Struct definitions */
 
-struct webgpu_pipeline_info {
-    std::string  name;
-    const char * shader_code;
-    ggml_type    src0_type;
-    ggml_type    src1_type;
-};
-
 // Forward reference
 static void ggml_webgpu_create_buffer(wgpu::Device &    device,
                                       wgpu::Buffer &    buffer,
@@ -571,12 +564,12 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
         (uint32_t) dst->ne[1],                                  // number of rows in result (M)
         (uint32_t) dst->ne[0],                                  // number of columns in result (N)
         (uint32_t) src0->ne[0],                                 // number of columns in src0/src1 (K)
-        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),  // stride (elements) of src0 in dimension 1
-        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),  // stride (elements) of src1 in dimension 1
-        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),  // stride (elements) of src0 in dimension 2
-        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),  // stride (elements) of src1 in dimension 2
-        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),  // stride (elements) of src0 in dimension 3
-        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),  // stride (elements) of src1 in dimension 3
+        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),  // stride (elements/blocks) of src0 in dimension 1
+        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),  // stride (elements/blocks) of src1 in dimension 1
+        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),  // stride (elements/blocks) of src0 in dimension 2
+        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),  // stride (elements/blocks) of src1 in dimension 2
+        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),  // stride (elements/blocks) of src0 in dimension 3
+        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),  // stride (elements/blocks) of src1 in dimension 3
         (uint32_t) src0->ne[2],                                 // batch size in dimension 2
         (uint32_t) src0->ne[3],                                 // batch size in dimension 3
         (uint32_t) (src1->ne[2] / src0->ne[2]),                 // broadcast in dimension 2
@@ -596,16 +589,11 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
          .buffer  = ggml_webgpu_tensor_buf(dst),
          .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
          .size    = ggml_webgpu_tensor_binding_size(ctx, dst)  },
-        //         { .binding = 3,
-        //           .buffer  = ctx->debug_dev_buf,
-        //           .offset  = 0,
-        //           .size    = ctx->debug_dev_buf.GetSize() }
     };
 
     uint32_t wg_x =
         (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x);
-    //ggml_backend_webgpu_debug(ctx);
 }
 
 // Returns true if node has enqueued work into the queue, false otherwise
@@ -915,103 +903,94 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
 }
 
 static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
-    webgpu_pipeline_info pipeline_infos[22] = {
-        { .name        = "mul_mat_f32_f32",
-         .shader_code = wgsl_mul_mat_f32_f32,
-         .src0_type   = GGML_TYPE_F32,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_f16_f16",
-         .shader_code = wgsl_mul_mat_f16_f16,
-         .src0_type   = GGML_TYPE_F16,
-         .src1_type   = GGML_TYPE_F16 },
-        { .name        = "mul_mat_f16_f32",
-         .shader_code = wgsl_mul_mat_f16_f32,
-         .src0_type   = GGML_TYPE_F16,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q4_0_f32",
-         .shader_code = wgsl_mul_mat_q4_0_f32,
-         .src0_type   = GGML_TYPE_Q4_0,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q4_1_f32",
-         .shader_code = wgsl_mul_mat_q4_1_f32,
-         .src0_type   = GGML_TYPE_Q4_1,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q5_0_f32",
-         .shader_code = wgsl_mul_mat_q5_0_f32,
-         .src0_type   = GGML_TYPE_Q5_0,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q5_1_f32",
-         .shader_code = wgsl_mul_mat_q5_1_f32,
-         .src0_type   = GGML_TYPE_Q5_1,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q8_0_f32",
-         .shader_code = wgsl_mul_mat_q8_0_f32,
-         .src0_type   = GGML_TYPE_Q8_0,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q2_k_f32",
-         .shader_code = wgsl_mul_mat_q2_k_f32,
-         .src0_type   = GGML_TYPE_Q2_K,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q3_k_f32",
-         .shader_code = wgsl_mul_mat_q3_k_f32,
-         .src0_type   = GGML_TYPE_Q3_K,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q4_k_f32",
-         .shader_code = wgsl_mul_mat_q4_k_f32,
-         .src0_type   = GGML_TYPE_Q4_K,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q5_k_f32",
-         .shader_code = wgsl_mul_mat_q5_k_f32,
-         .src0_type   = GGML_TYPE_Q5_K,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_q6_k_f32",
-         .shader_code = wgsl_mul_mat_q6_k_f32,
-         .src0_type   = GGML_TYPE_Q6_K,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq2_xxs_f32",
-         .shader_code = wgsl_mul_mat_iq2_xxs_f32,
-         .src0_type   = GGML_TYPE_IQ2_XXS,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq2_xs_f32",
-         .shader_code = wgsl_mul_mat_iq2_xs_f32,
-         .src0_type   = GGML_TYPE_IQ2_XS,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq2_s_f32",
-         .shader_code = wgsl_mul_mat_iq2_s_f32,
-         .src0_type   = GGML_TYPE_IQ2_S,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq3_xxs_f32",
-         .shader_code = wgsl_mul_mat_iq3_xxs_f32,
-         .src0_type   = GGML_TYPE_IQ3_XXS,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq3_s_f32",
-         .shader_code = wgsl_mul_mat_iq3_s_f32,
-         .src0_type   = GGML_TYPE_IQ3_S,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq1_s_f32",
-         .shader_code = wgsl_mul_mat_iq1_s_f32,
-         .src0_type   = GGML_TYPE_IQ1_S,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq1_m_f32",
-         .shader_code = wgsl_mul_mat_iq1_m_f32,
-         .src0_type   = GGML_TYPE_IQ1_M,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq4_nl_f32",
-         .shader_code = wgsl_mul_mat_iq4_nl_f32,
-         .src0_type   = GGML_TYPE_IQ4_NL,
-         .src1_type   = GGML_TYPE_F32 },
-        { .name        = "mul_mat_iq4_xs_f32",
-         .shader_code = wgsl_mul_mat_iq4_xs_f32,
-         .src0_type   = GGML_TYPE_IQ4_XS,
-         .src1_type   = GGML_TYPE_F32 }
-    };
-
-    for (auto & pipeline_info : pipeline_infos) {
-        ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                    webgpu_ctx->mul_mat_pipeline[pipeline_info.src0_type][pipeline_info.src1_type],
-                                    pipeline_info.shader_code,
-                                    pipeline_info.name.data());
-    }
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
+                                wgsl_mul_mat_f32_f32,
+                                "mul_mat_f32_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
+                                wgsl_mul_mat_f16_f16,
+                                "mul_mat_f16_f16");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
+                                wgsl_mul_mat_f16_f32,
+                                "mul_mat_f16_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
+                                wgsl_mul_mat_q4_0_f32,
+                                "mul_mat_q4_0_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
+                                wgsl_mul_mat_q4_1_f32,
+                                "mul_mat_q4_1_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
+                                wgsl_mul_mat_q5_0_f32,
+                                "mul_mat_q5_0_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
+                                wgsl_mul_mat_q5_1_f32,
+                                "mul_mat_q5_1_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
+                                wgsl_mul_mat_q8_0_f32,
+                                "mul_mat_q8_0_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q2_k_f32,
+                                "mul_mat_q2_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q3_k_f32,
+                                "mul_mat_q3_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q4_k_f32,
+                                "mul_mat_q4_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q5_k_f32,
+                                "mul_mat_q5_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q6_k_f32,
+                                "mul_mat_q6_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq2_xxs_f32,
+                                "mul_mat_iq2_xxs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq2_xs_f32,
+                                "mul_mat_iq2_xs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq2_s_f32,
+                                "mul_mat_iq2_s_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq3_xxs_f32,
+                                "mul_mat_iq3_xxs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq3_s_f32,
+                                "mul_mat_iq3_s_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq1_s_f32,
+                                "mul_mat_iq1_s_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq1_m_f32,
+                                "mul_mat_iq1_m_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq4_nl_f32,
+                                "mul_mat_iq4_nl_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device,
+                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq4_xs_f32,
+                                "mul_mat_iq4_xs_f32");
 }
 
 static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
@@ -18,7 +18,8 @@ def parse_decls(decls_text):
 
 def replace_placeholders(shader_text, replacements):
     for key, val in replacements.items():
-        pattern = rf'\b{re.escape(key)}\b'
+        # Match the placeholder {{KEY}}, allowing optional whitespace inside the braces; KEY is regex-escaped
+        pattern = r'{{\s*' + re.escape(key) + r'\s*}}'
         shader_text = re.sub(pattern, str(val), shader_text)
     return shader_text
 
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
@@ -1731,13 +1731,13 @@ enable f16;
 DECLS
 
 struct MulMatParams {
-    offset_src0: u32, // in elements
-    offset_src1: u32, // in elements
-    offset_dst: u32, // in elements
+    offset_src0: u32, // in elements/blocks
+    offset_src1: u32, // in elements/blocks
+    offset_dst: u32, // in elements (dst is declared as array<f32>)
     m: u32,
     n: u32,
     k: u32,
-    // all strides are in elements
+    // all strides are in elements/blocks
     stride_01: u32,
     stride_11: u32,
     stride_02: u32,
@@ -1751,10 +1751,9 @@ struct MulMatParams {
     broadcast3: u32
 };
 
-@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>; // N rows, K columns
-@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>; // M rows, K columns (transposed)
+@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // N rows, K columns
+@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // M rows, K columns (transposed)
 @group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
-//@group(0) @binding(3) var<storage, read_write> debug: array<f32>;
 
 @group(0) @binding(3) var<uniform> params: MulMatParams;
 
@@ -1786,7 +1785,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11;
 
     var sum = 0.0;
-    for (var i: u32 = 0u; i < params.k/BLOCK_SIZE; i = i + 1u) {
+    for (var i: u32 = 0u; i < params.k/{{BLOCK_SIZE}}; i = i + 1u) {
         sum += multiply_add(src0_idx_base, src1_idx_base, i);
     }
     dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -979,15 +979,6 @@ struct test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
             init_tensor_uniform(t);
         }
-        // print first 32 elements of each tensor
-//        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-//            if (strcmp(ggml_get_name(t), "a") == 0) {
-//                std::vector<float> values = tensor_to_float(t);
-//                for (int i = 0; i < 32; i++) {
-//                    printf("%s[%d] = %f\n", ggml_get_name(t), i, values[i]);
-//                }
-//            }
-//        }
     }
 
     virtual size_t op_size(ggml_tensor * t) {