Replace unneeded use of the `width` template argument with `blockDim.x`

ORippler · ORippler · commit 7c3c4545554d · 2025-08-05T03:22:48.000-07:00
This will allow us to avoid compiling the kernel multiple times (once per value of the `width` template argument, i.e. per block size).
diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -40,7 +40,7 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
     sum = 0.0f;
     if constexpr (width > WARP_SIZE) {
         static_assert((width <= 1024) && (width % WARP_SIZE) == 0, "unexpected block_size");
-        if (lane_id < (width / WARP_SIZE)) {
+        if (lane_id < (blockDim.x / WARP_SIZE)) {
             sum = s_sum[lane_id];
         }
         sum = warp_reduce_sum(sum);

Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r`
`40`	`40`	`sum = 0.0f;`
`41`	`41`	`if constexpr (width > WARP_SIZE) {`
`42`	`42`	`static_assert((width <= 1024) && (width % WARP_SIZE) == 0, "unexpected block_size");`
`43`		`- if (lane_id < (width / WARP_SIZE)) {`
	`43`	`+ if (lane_id < (blockDim.x / WARP_SIZE)) {`
`44`	`44`	`sum = s_sum[lane_id];`
`45`	`45`	`}`
`46`	`46`	`sum = warp_reduce_sum(sum);`