Commit 278c832 (parent 9c1f520)

Remove unnecessary template argument from reduce_rows_f32

Checking at run time does not incur any performance overhead.
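
In code terms, the commit swaps a compile-time width check for an equivalent run-time one. A minimal sketch of the two patterns (illustrative kernel names, not the ggml source; assuming 32-wide warps):

```cuda
// Before: the block width is a template parameter. The cross-warp branch
// is resolved at compile time, so every width needs its own instantiation.
template <int width>
__global__ void reduce_compile_time() {
    if constexpr (width > 32) {
        // cross-warp reduction path
    }
}

// After: a single instantiation covers every launch configuration.
// blockDim.x is uniform across the block, so the branch is identical for
// all threads, never diverges, and is effectively free at run time.
__global__ void reduce_run_time() {
    if (blockDim.x > 32) {
        // cross-warp reduction path
    }
}
```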

3 files changed: +9 −9 lines

ggml/src/ggml-cuda/mean.cu (2 additions & 2 deletions)

```diff
@@ -17,9 +17,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const dim3 block_nums(nrows, 1, 1);
     if ((nrows / ctx.sm_count)< 2){
         constexpr dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/ true, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         constexpr dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/ true, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }
```
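
The same 512-vs-128 dispatch appears in the sum_rows paths below. For readers outside the ggml tree, a self-contained sketch of the heuristic (launch_mean_rows is a hypothetical helper; ggml reads the cached ctx.sm_count instead of querying the device on every call, and reduce_rows_f32 is assumed visible via reduce_rows.cuh):

```cuda
#include <cuda_runtime.h>

// Hypothetical standalone wrapper reproducing the block-size heuristic.
static void launch_mean_rows(const float * x, float * dst,
                             int ncols, int nrows, cudaStream_t stream) {
    int device   = 0;
    int sm_count = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);

    const dim3 block_nums(nrows, 1, 1); // one block per row
    if (nrows / sm_count < 2) {
        // Few rows per SM: larger blocks hide latency within each block.
        reduce_rows_f32</*norm=*/true><<<block_nums, dim3(512, 1, 1), 0, stream>>>(x, dst, ncols);
    } else {
        // Enough blocks to fill the GPU: smaller blocks schedule better.
        reduce_rows_f32</*norm=*/true><<<block_nums, dim3(128, 1, 1), 0, stream>>>(x, dst, ncols);
    }
}
```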

ggml/src/ggml-cuda/reduce_rows.cuh (3 additions & 3 deletions)

```diff
@@ -1,7 +1,7 @@
 #include "common.cuh"
 
 // Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template<bool norm, int width = WARP_SIZE>
+template<bool norm>
 static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
     const int row = blockIdx.x;
     const int col = threadIdx.x;
@@ -30,7 +30,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
 
     // sum up partial sums
     sum = warp_reduce_sum(sum);
-    if constexpr (width > WARP_SIZE) {
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
         __shared__ float s_sum[32];
         const int warp_id = threadIdx.x / WARP_SIZE;
         const int lane_id = threadIdx.x % WARP_SIZE;
@@ -39,7 +40,6 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
         }
         __syncthreads();
         sum = 0.0f;
-        static_assert((width <= 1024) && (width % WARP_SIZE) == 0, "unexpected block_size");
         if (lane_id < (blockDim.x / WARP_SIZE)) {
             sum = s_sum[lane_id];
         }
```
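
Assembled from the hunks above, the post-commit kernel plausibly reads as follows. The per-thread accumulation loop sits in a part of the file the diff does not show, and warp_reduce_sum lives in common.cuh; both are reconstructed here as assumptions (the helper is sketched as a shuffle-based butterfly reduction and may differ from the real one):

```cuda
#include <assert.h>

#define WARP_SIZE 32 // assumption: 32-wide warps, as on current NVIDIA GPUs

// Assumed shape of the common.cuh helper: butterfly sum over one warp.
static __device__ __forceinline__ float warp_reduce_sum(float x) {
    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, WARP_SIZE);
    }
    return x;
}

// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Reconstructed: each thread accumulates a strided slice of its row.
    float sum = 0.0f;
    for (int i = col; i < ncols; i += blockDim.x) {
        sum += x[row * ncols + i];
    }

    // sum up partial sums within each warp
    sum = warp_reduce_sum(sum);
    if (blockDim.x > WARP_SIZE) {
        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
        __shared__ float s_sum[32];
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = sum;
        }
        __syncthreads();
        // The first warp reduces the per-warp partial sums.
        sum = 0.0f;
        if (lane_id < (blockDim.x / WARP_SIZE)) {
            sum = s_sum[lane_id];
        }
        sum = warp_reduce_sum(sum);
    }

    if (col != 0) {
        return;
    }
    dst[row] = norm ? sum / (float) ncols : sum;
}
```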

ggml/src/ggml-cuda/sumrows.cu (4 additions & 4 deletions)

```diff
@@ -5,10 +5,10 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int
     const dim3 block_nums(nrows, 1, 1);
     if ((nrows / n_sm)< 2){
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
         const dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     }
 }
 
@@ -30,10 +30,10 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     if ((nrows / ctx.sm_count)< 2){
         // Increase num threads to 512 for small nrows to better hide the latency
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
         const dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }
```
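
As a worked example of the dispatch (device numbers are illustrative): on a GPU with 80 SMs, nrows / 80 < 2 holds for any nrows below 160, so such launches use 512-thread blocks to hide memory latency inside each block; from 160 rows upward there are at least two blocks per SM, and the 128-thread configuration gives the scheduler more, smaller blocks to interleave.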
