@@ -5,10 +5,10 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int
     const dim3 block_nums(nrows, 1, 1);
     if ((nrows / n_sm) < 2) {
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
         const dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     }
 }

@@ -30,10 +30,10 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     if ((nrows / ctx.sm_count) < 2) {
         // Increase num threads to 512 for small nrows to better hide the latency
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
        const dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }
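
Both launchers drop the compile-time block-size template argument, so the block size (512 or 128 threads) becomes a pure runtime choice. For that to compile, reduce_rows_f32 must take only the norm template parameter and read its block size from blockDim.x. Below is a minimal sketch of such a kernel, assuming a warp-shuffle reduction with static shared memory; the reduction strategy and local names are illustrative, not necessarily the actual ggml implementation.

// Hypothetical sketch: reduce_rows_f32 with the block size read at runtime.
// Works for any warp-multiple block size up to 512 threads, matching the
// 512- and 128-thread launches above. Uses only static shared memory, which
// is consistent with the 0 dynamic shared memory in the launch configs.
template <bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x; // grid is (nrows, 1, 1), one block per row

    // Each thread accumulates a strided slice of its row.
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += x[row * ncols + col];
    }

    // Intra-warp reduction via shuffles.
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        sum += __shfl_down_sync(0xFFFFFFFF, sum, offset);
    }

    // One partial sum per warp; 16 slots covers a 512-thread block.
    __shared__ float warp_sums[16];
    const int warp_id = threadIdx.x / warpSize;
    const int lane_id = threadIdx.x % warpSize;
    if (lane_id == 0) {
        warp_sums[warp_id] = sum;
    }
    __syncthreads();

    // The first warp reduces the per-warp partials and writes the result.
    if (warp_id == 0) {
        const int n_warps = (blockDim.x + warpSize - 1) / warpSize;
        sum = lane_id < n_warps ? warp_sums[lane_id] : 0.0f;
        for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
            sum += __shfl_down_sync(0xFFFFFFFF, sum, offset);
        }
        if (lane_id == 0) {
            dst[row] = norm ? sum / ncols : sum;
        }
    }
}

With the kernel shape-agnostic, the branch at the call sites is purely a scheduling decision, as the comments note: with fewer than two rows per SM there are too few blocks to keep the GPU busy, so wider 512-thread blocks hide memory latency within each row, while with many rows the narrower 128-thread blocks give the scheduler more independent blocks to interleave.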