Ensure perf gains also apply to small ncols and large nrows

ORippler · ORippler · commit 8e04242c8420 · 2025-08-07T00:18:30.000-07:00
As an alternative, the number of unrollings could have been made a
template parameter, but that would require compiling the kernel multiple
times, unnecessarily increasing binary size.
diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu
@@ -19,7 +19,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         constexpr dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
-        constexpr dim3 block_dims(128, 1, 1);
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
         reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }
diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu
@@ -7,7 +7,7 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
-        const dim3 block_dims(128, 1, 1);
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     }
 }
@@ -33,7 +33,7 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
-        const dim3 block_dims(128, 1, 1);
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {`
`19`	`19`	`constexpr dim3 block_dims(512, 1, 1);`
`20`	`20`	`reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
`21`	`21`	`} else {`
`22`		`- constexpr dim3 block_dims(128, 1, 1);`
	`22`	`+ const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);`
`23`	`23`	`reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
`24`	`24`	`}`
`25`	`25`	`}`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int`
`7`	`7`	`const dim3 block_dims(512, 1, 1);`
`8`	`8`	`reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);`
`9`	`9`	`} else {`
`10`		`- const dim3 block_dims(128, 1, 1);`
	`10`	`+ const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);`
`11`	`11`	`reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);`
`12`	`12`	`}`
`13`	`13`	`}`
`@@ -33,7 +33,7 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {`
`33`	`33`	`reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
`34`	`34`	`} else {`
`35`	`35`	`// Enough active SMs to hide latency, use smaller blocks to allow better scheduling`
`36`		`- const dim3 block_dims(128, 1, 1);`
	`36`	`+ const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);`
`37`	`37`	`reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
`38`	`38`	`}`
`39`	`39`	`}`