metal: size conv_transpose_2d threadgroup memory dynamically (removes the KW*KH <= 64 limit)

iliailmer · iliailmer · commit 9f3e11c7981f · 2025-10-16T08:59:43.000-04:00
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -3094,8 +3094,6 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
     const int32_t KH = op->src[0]->ne[1];
     const int32_t KW = op->src[0]->ne[0];
 
-    GGML_ASSERT(KW * KH <= 64 && "conv_transpose_2d kernel size exceeds threadgroup memory limit");
-
     const int32_t OW = op->ne[0];
     const int32_t OH = op->ne[1];
     const int32_t OC = op->ne[2];
@@ -3121,6 +3119,10 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
 
+    // Metal requires the threadgroup memory length to be a multiple of 16 bytes
+    const size_t smem = GGML_PAD(KW * KH * sizeof(float), 16);
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
     ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, KW, KH, 1);
 
     return 1;
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4146,6 +4146,7 @@ kernel void kernel_conv_transpose_2d(
         device const T * src0,
         device const float * src1,
         device        char * dst,
+        threadgroup float * shared_sum [[threadgroup(0)]],
         uint3   tgpig[[threadgroup_position_in_grid]],
         uint3   tpitg[[thread_position_in_threadgroup]],
         uint3     ntg[[threads_per_threadgroup]]) {
@@ -4182,7 +4183,6 @@ kernel void kernel_conv_transpose_2d(
         v += (float)src0[kernel_idx] * src1[input_idx];
     }
 
-    threadgroup float shared_sum[64];
     const uint tid = tpitg.y * ntg.x + tpitg.x;
     shared_sum[tid] = v;
 
@@ -4206,6 +4206,7 @@ kernel void kernel_conv_transpose_2d<float>(
     device const float * src0,
     device const float * src1,
     device        char * dst,
+    threadgroup float * shared_sum [[threadgroup(0)]],
     uint3   tgpig[[threadgroup_position_in_grid]],
     uint3   tpitg[[thread_position_in_threadgroup]],
     uint3     ntg[[threads_per_threadgroup]]);
@@ -4216,6 +4217,7 @@ kernel void kernel_conv_transpose_2d<half>(
     device const half  * src0,
     device const float * src1,
     device        char * dst,
+    threadgroup float * shared_sum [[threadgroup(0)]],
     uint3   tgpig[[threadgroup_position_in_grid]],
     uint3   tpitg[[thread_position_in_threadgroup]],
     uint3     ntg[[threads_per_threadgroup]]);