
Commit ece608a

Further optimizations to reduce_rows_f32
1. Increase the threadblock size to better hide the latency of memory requests. As a consequence of the bigger threadblocks, do a two-step summation, using shared memory to communicate partial results between the two warp_reduce_sum invocations.
2. Use a sum_temp array to reduce waits on sum (see the annotated excerpt below).
3. Adjust num_unroll to reflect the bigger threadblock.
4. Improve the default block_dims and extend support to more block_dims values.
1 parent c270ffe commit ece608a
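
For context, the core of point 2 as it lands in reduce_rows.cuh below, with the reasoning spelled out in comments (this is an annotated excerpt of the diff, not additional code from the commit):

    // Before: every iteration extends a single dependency chain on `sum`,
    // so each add has to wait for the previous one to complete.
    for (int j = 0; j < num_unroll; ++j) {
        sum += temp[j];
    }

    // After: num_unroll independent accumulators inside the main loop,
    // so the adds for different j can be in flight at the same time ...
    for (int j = 0; j < num_unroll; ++j) {
        sum_temp[j] += temp[j];
    }

    // ... plus a single combine pass once the main loop over the row is done.
    for (int j = 0; j < num_unroll; ++j) {
        sum += sum_temp[j];
    }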

3 files changed: +25 −5 lines


ggml/src/ggml-cuda/mean.cu

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    const dim3 block_dims(WARP_SIZE, 1, 1);
+    constexpr dim3 block_dims(512, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
     reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
 }
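
Taken together, the launch shape used by both mean and sum_rows after this change is: one threadblock per row, 512 threads per block striding across the columns of that row. Below is an annotated copy of the launch lines from the hunk above; the comment about blockIdx.x is an inference from block_nums, not something shown in this diff:

    constexpr dim3 block_dims(512, 1, 1);   // 512 threads cooperate on a single row
    const dim3 block_nums(nrows, 1, 1);     // one block per row; the kernel presumably selects its row via blockIdx.x
    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);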

ggml/src/ggml-cuda/reduce_rows.cuh

Lines changed: 22 additions & 2 deletions

@@ -7,8 +7,9 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
     const int col = threadIdx.x;
 
     float sum = 0.0f;
-    const int num_unroll = 24;
+    const int num_unroll = 8;
     float temp[num_unroll];
+    float sum_temp[num_unroll] = {0.0f};
     for (int i = col; i < ncols;) {
         for (int j = 0; j < num_unroll; ++j){
             if (i < ncols){

@@ -20,11 +21,30 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
             i += blockDim.x;
         }
         for (int j = 0; j < num_unroll; ++j){
-            sum += temp[j];
+            sum_temp[j] += temp[j];
         }
     }
+    for (int j = 0; j < num_unroll; ++j){
+        sum += sum_temp[j];
+    }
 
+    // sum up partial sums
     sum = warp_reduce_sum(sum);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = sum;
+        }
+        __syncthreads();
+        sum = 0.0f;
+        if (lane_id < (blockDim.x / WARP_SIZE)) {
+            sum = s_sum[lane_id];
+        }
+        sum = warp_reduce_sum(sum);
+    }
 
     if (col != 0) {
         return;
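
The two-step reduction above first reduces within each warp, stages one partial sum per warp in s_sum, and then has the first warp reduce those staged values. It relies on warp_reduce_sum from ggml's common CUDA header, which is not part of this diff; a minimal sketch of what such a shuffle-based helper typically looks like (an assumption for illustration, not code from this commit):

    // Butterfly reduction within one warp: after the loop, every lane holds
    // the sum of the values originally held by all WARP_SIZE (32) lanes.
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, WARP_SIZE);
        }
        return x;
    }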

ggml/src/ggml-cuda/sumrows.cu

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 #include "reduce_rows.cuh"
 
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
+    constexpr dim3 block_dims(512, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
     reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }

@@ -20,7 +20,7 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    const dim3 block_dims(WARP_SIZE, 1, 1);
+    constexpr dim3 block_dims(512, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
 
     reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
