Commit 9e87786

Move shared memory inside code executed for >1 warp
1 parent 7c3c454 commit 9e87786


ggml/src/ggml-cuda/reduce_rows.cuh

Lines changed: 8 additions & 8 deletions
@@ -30,15 +30,15 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r

     // sum up partial sums
     sum = warp_reduce_sum(sum);
-    __shared__ float s_sum[32];
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-    if (lane_id == 0) {
-        s_sum[warp_id] = sum;
-    }
-    __syncthreads();
-    sum = 0.0f;
     if constexpr (width > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = sum;
+        }
+        __syncthreads();
+        sum = 0.0f;
         static_assert((width <= 1024) && (width % WARP_SIZE) == 0, "unexpected block_size");
         if (lane_id < (blockDim.x / WARP_SIZE)) {
             sum = s_sum[lane_id];
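For context, below is a minimal, self-contained sketch of the reduction pattern this commit touches. It is not the ggml source: the kernel reduce_rows_demo, the re-implemented warp_reduce_sum, and the host setup are illustrative assumptions (compile with -std=c++17 for if constexpr). The point it shows is that the shared-memory stage and its __syncthreads barrier are only needed when the block holds more than one warp, which is why the patch moves that code inside the if constexpr (width > WARP_SIZE) branch.

// sketch only: names and launch parameters are hypothetical, not taken from ggml
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

#define WARP_SIZE 32

// Butterfly reduction: every lane of the warp ends up holding the warp-wide sum.
static __device__ float warp_reduce_sum(float x) {
    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, WARP_SIZE);
    }
    return x;
}

// One block per row; width is the block size, each thread accumulates a strided partial sum.
template <int width>
static __global__ void reduce_rows_demo(const float * __restrict__ x, float * __restrict__ dst, int ncols) {
    const int row = blockIdx.x;
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += width) {
        sum += x[row * ncols + col];
    }

    // The intra-warp reduction is always needed.
    sum = warp_reduce_sum(sum);

    // The shared-memory stage only matters when the block spans more than one warp,
    // so it is compiled (and the barrier executed) only in that case.
    if constexpr (width > WARP_SIZE) {
        __shared__ float s_sum[32];
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = sum;
        }
        __syncthreads();
        sum = 0.0f;
        if (lane_id < (blockDim.x / WARP_SIZE)) {
            sum = s_sum[lane_id];
        }
        sum = warp_reduce_sum(sum);
    }

    if (threadIdx.x == 0) {
        dst[row] = sum;
    }
}

int main() {
    const int nrows = 4, ncols = 1000;
    std::vector<float> h_x(nrows * ncols, 1.0f);
    float *d_x, *d_dst;
    cudaMalloc(&d_x, nrows * ncols * sizeof(float));
    cudaMalloc(&d_dst, nrows * sizeof(float));
    cudaMemcpy(d_x, h_x.data(), nrows * ncols * sizeof(float), cudaMemcpyHostToDevice);

    constexpr int width = 256;  // block size > WARP_SIZE, so the shared-memory path is compiled in
    reduce_rows_demo<width><<<nrows, width>>>(d_x, d_dst, ncols);

    float h_dst[nrows];
    cudaMemcpy(h_dst, d_dst, nrows * sizeof(float), cudaMemcpyDeviceToHost);
    printf("row 0 sum = %g (expected %d)\n", h_dst[0], ncols);
    cudaFree(d_x);
    cudaFree(d_dst);
    return 0;
}

Launching with width <= WARP_SIZE (e.g. reduce_rows_demo<32>) skips the shared-memory branch entirely, which is the single-warp case the commit message refers to.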
