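metal: parallelize conv_transpose_2d across kernel taps (KW x KH threads per threadgroup, partial sums reduced via threadgroup memory); add more conv_transpose_2d test cases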

iliailmer · iliailmer · commit 7b6f66246e6d · 2025-10-15T22:00:06.000-04:00
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -3094,6 +3094,8 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
     const int32_t KH = op->src[0]->ne[1];
     const int32_t KW = op->src[0]->ne[0];
 
+    GGML_ASSERT(KW * KH <= 64 && "conv_transpose_2d kernel size exceeds threadgroup memory limit");
+
     const int32_t OW = op->ne[0];
     const int32_t OH = op->ne[1];
     const int32_t OC = op->ne[2];
@@ -3119,7 +3121,7 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, 1, 1, 1);
+    ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, KW, KH, 1);
 
     return 1;
 }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4147,45 +4147,57 @@ kernel void kernel_conv_transpose_2d(
         device const float * src1,
         device        char * dst,
         uint3   tgpig[[threadgroup_position_in_grid]],
-        uint3    tgpg[[threadgroups_per_grid]]) {
+        uint3   tpitg[[thread_position_in_threadgroup]],
+        uint3     ntg[[threads_per_threadgroup]]) {
 
     const int64_t out_x = tgpig[0];
     const int64_t out_y = tgpig[1];
     const int64_t out_c = tgpig[2];
 
+    const int64_t kw = tpitg[0];
+    const int64_t kh = tpitg[1];
+
     float v = 0.0f;
 
     for (int64_t in_c = 0; in_c < args.IC; in_c++) {
-        for (int64_t kh = 0; kh < args.KH; kh++) {
+        int64_t in_y = out_y - kh;
+
+        if (in_y < 0 || in_y % args.s0) continue;
 
-            int64_t in_y = out_y - kh;
+        in_y /= args.s0;
 
-            if (in_y < 0 || in_y % args.s0) continue;
+        if (in_y >= args.IH) continue;
 
-            in_y /= args.s0;
+        int64_t in_x = out_x - kw;
 
-            if (in_y >= args.IH) continue;
+        if (in_x < 0 || in_x % args.s0) continue;
 
-            for (int64_t kw = 0; kw < args.KW; kw++) {
-                int64_t in_x = out_x - kw;
+        in_x /= args.s0;
 
-                if (in_x < 0 || in_x % args.s0) continue;
+        if (in_x >= args.IW) continue;
 
-                in_x /= args.s0;
+        const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
+        const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;
 
-                if (in_x >= args.IW) continue;
+        v += (float)src0[kernel_idx] * src1[input_idx];
+    }
 
-                const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
-                const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;
+    threadgroup float shared_sum[64];
+    const uint tid = tpitg.y * ntg.x + tpitg.x;
+    shared_sum[tid] = v;
 
-                v += (float)src0[kernel_idx] * src1[input_idx];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
 
-            }
+    if (tid == 0) {
+        float total = 0.0f;
+        const uint num_threads = ntg.x * ntg.y;
+        for (uint i = 0; i < num_threads; i++) {
+            total += shared_sum[i];
         }
-    }
-    device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y * args.nb1 + out_c*args.nb2);
 
-    dst_ptr[0] = v;
+        device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y * args.nb1 + out_c*args.nb2);
+        dst_ptr[0] = total;
+    }
 }
 
 template [[host_name("kernel_conv_transpose_2d_f32_f32")]]
@@ -4195,7 +4207,8 @@ kernel void kernel_conv_transpose_2d<float>(
     device const float * src1,
     device        char * dst,
     uint3   tgpig[[threadgroup_position_in_grid]],
-    uint3    tgpg[[threadgroups_per_grid]]);
+    uint3   tpitg[[thread_position_in_threadgroup]],
+    uint3     ntg[[threads_per_threadgroup]]);
 
 template [[host_name("kernel_conv_transpose_2d_f16_f32")]]
 kernel void kernel_conv_transpose_2d<half>(
@@ -4204,7 +4217,8 @@ kernel void kernel_conv_transpose_2d<half>(
     device const float * src1,
     device        char * dst,
     uint3   tgpig[[threadgroup_position_in_grid]],
-    uint3    tgpg[[threadgroups_per_grid]]);
+    uint3   tpitg[[thread_position_in_threadgroup]],
+    uint3     ntg[[threads_per_threadgroup]]);
 
 kernel void kernel_upscale_f32(
     constant ggml_metal_kargs_upscale & args,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -6952,6 +6952,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
 
     test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1));
+    test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1));
+    test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
 
     test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));