
Commit e55176a

Revert "feat: Parallel sum in SSM_CONV"
After discussion with @compilade, the amount of parallelism available here is not worth the cost in complexity or the overhead of the parallel sum: ggml-org#14743 (comment)

This reverts commit 16bc059.

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 16bc059 commit e55176a

2 files changed: +9 −52 lines

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 1 addition & 20 deletions
@@ -2909,26 +2909,7 @@ static bool ggml_metal_encode_node(
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                 [encoder setBytes:&args length:sizeof(args) atIndex:3];
 
-                const int64_t d_state = ne10;
-
-                // One shared memory bucket for each simd group in the threadgroup
-                if (d_state >= 32) {
-                    const int64_t shmem_size = d_state / 32;
-
-                    // The final simd_sum won't work if the number of simd groups is
-                    // larger than the size of a single simd group. If this case is
-                    // hit at some point, the logic in the second simd_sum could be
-                    // expanded to handle this with one more sequential simd_sum to
-                    // collapse simd group sums another time.
-                    GGML_ASSERT(shmem_size <= 32);
-
-                    // One thread pre element in d_state
-                    GGML_ASSERT(d_state <= (int64_t)pipeline.maxTotalThreadsPerThreadgroup);
-
-                    [encoder setThreadgroupMemoryLength:(shmem_size)*sizeof(float) atIndex:0];
-                }
-
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
         case GGML_OP_SSM_SCAN:
             {
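
For context on what the removed setup block computed: the kernel was launched with one thread per summed element, each simdgroup of 32 threads produced one partial sum, and threadgroup memory held one float per simdgroup, which is why the asserts bounded the element count by 32×32 and by the pipeline's thread limit. Below is a minimal CPU-side sketch of that arithmetic; it is illustrative only (not ggml code), and the simdgroup width of 32 and the helper name ssm_conv_shmem_bytes are assumptions made for the example.

    // Hypothetical CPU-side sketch (not ggml code) of the launch-geometry math in the
    // block removed above. SIMDGROUP_WIDTH and the helper name are assumptions.
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SIMDGROUP_WIDTH 32

    // Threadgroup memory (in bytes) that the removed block would have requested:
    // one float bucket per simdgroup when more than one simdgroup is needed.
    static size_t ssm_conv_shmem_bytes(int64_t d_state, int64_t max_threads_per_threadgroup) {
        if (d_state < SIMDGROUP_WIDTH) {
            return 0; // a single simdgroup covers all elements; no shared buckets needed
        }
        const int64_t n_buckets = d_state / SIMDGROUP_WIDTH;

        // The second simd_sum collapses the per-simdgroup buckets, so there can be at
        // most one simdgroup's worth of them (mirrors GGML_ASSERT(shmem_size <= 32)).
        assert(n_buckets <= SIMDGROUP_WIDTH);

        // One thread per element must fit in a single threadgroup (mirrors the second assert).
        assert(d_state <= max_threads_per_threadgroup);

        return (size_t) n_buckets * sizeof(float);
    }

    int main(void) {
        // e.g. 128 elements with a typical 1024-thread threadgroup limit -> 4 buckets, 16 bytes
        printf("%zu bytes\n", ssm_conv_shmem_bytes(128, 1024));
        return 0;
    }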

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 8 additions & 32 deletions
@@ -1663,16 +1663,10 @@ kernel void kernel_ssm_conv_f32(
         device const void * src0,
         device const void * src1,
         device       float * dst,
-        threadgroup  float * shared [[threadgroup(0)]],
         constant ggml_metal_kargs_ssm_conv & args,
-        uint3   tgpig[[threadgroup_position_in_grid]],
-        uint3   tpitg[[thread_position_in_threadgroup]],
-        ushort  sgitg[[simdgroup_index_in_threadgroup]],
-        ushort  tiisg[[thread_index_in_simdgroup]],
-        ushort  sgptg[[simdgroups_per_threadgroup]],
-        uint3   tgpg[[threadgroups_per_grid]]) {
-
-    const int64_t i0 = tpitg.x;
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
     const int64_t ir = tgpig.x;
     const int64_t i2 = tgpig.y;
     const int64_t i3 = tgpig.z;
@@ -1687,31 +1681,13 @@ kernel void kernel_ssm_conv_f32(
     device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
     device       float * x = (device       float *) ((device       char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
 
-    float sumf = s[i0] * c[i0];
-
-    // Parallel sum: first sum over threads in simd group, then sum over simd
-    // group sums
-    sumf = simd_sum(sumf);
+    float sumf = 0.0f;
 
-    // If multiple simd groups per threadgroup, sum over simd group sums
-    if (sgptg > 1) {
-        if (tiisg == 0) {
-            shared[sgitg] = sumf;
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        sumf = 0.0f;
-        if (sgitg == 0) {
-            if (tiisg < sgptg) {
-                sumf = shared[tiisg];
-            }
-            sumf = simd_sum(sumf);
-            if (tiisg == 0) {
-                x[0] = sumf;
-            }
-        }
-    } else if (tiisg == 0) {
-        x[0] = sumf;
+    for (int64_t i0 = 0; i0 < nc; ++i0) {
+        sumf += s[i0] * c[i0];
     }
+
+    x[0] = sumf;
 }
 
 // ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-1 part
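
To make the kernel change concrete, here is a hedged CPU-only sketch contrasting the restored sequential loop with an emulation of the two-level simd_sum reduction being reverted. It is plain C rather than Metal, and the function names and the fixed width of 32 are assumptions for illustration. Both strategies compute the same dot product (up to floating-point summation order); per the commit message, the revert trades the parallelism away because the sum is too small to justify the added complexity and overhead.

    // Hypothetical CPU-only sketch (plain C, not ggml/Metal) contrasting the two
    // strategies for the per-row dot product x[0] = sum_{i0 < nc} s[i0]*c[i0].
    // Function names and the fixed width of 32 are assumptions for illustration.
    #include <assert.h>
    #include <stdio.h>

    #define SIMDGROUP_WIDTH 32

    // Restored behaviour: a single thread walks the whole row sequentially.
    static float conv_dot_sequential(const float * s, const float * c, int nc) {
        float sumf = 0.0f;
        for (int i0 = 0; i0 < nc; ++i0) {
            sumf += s[i0] * c[i0];
        }
        return sumf;
    }

    // Reverted behaviour, emulated on the CPU: each "thread" computes one product,
    // every group of 32 collapses its values (the first simd_sum), and group 0 then
    // collapses the per-group partials held in "shared" memory (the second simd_sum).
    static float conv_dot_two_level(const float * s, const float * c, int nc) {
        float shared[SIMDGROUP_WIDTH] = {0};
        const int n_groups = (nc + SIMDGROUP_WIDTH - 1) / SIMDGROUP_WIDTH;
        assert(n_groups <= SIMDGROUP_WIDTH); // mirrors the assert in the removed host code

        for (int g = 0; g < n_groups; ++g) {
            float group_sum = 0.0f; // stands in for simd_sum across the group's threads
            for (int t = 0; t < SIMDGROUP_WIDTH; ++t) {
                const int i0 = g*SIMDGROUP_WIDTH + t;
                if (i0 < nc) {
                    group_sum += s[i0] * c[i0];
                }
            }
            shared[g] = group_sum; // thread 0 of each group writes its bucket
        }

        float sumf = 0.0f; // second reduction over the buckets
        for (int g = 0; g < n_groups; ++g) {
            sumf += shared[g];
        }
        return sumf;
    }

    int main(void) {
        const float s[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        const float c[4] = {0.5f, 0.5f, 0.5f, 0.5f};
        // both print 5.000000: the revert changes the execution strategy, not the math
        printf("%f %f\n", conv_dot_sequential(s, c, 4), conv_dot_two_level(s, c, 4));
        return 0;
    }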
