Commit 6028bf7

CUDA: Optimize reduce_rows_f32 kernel, leading to up to 25x kernel-level perf improvement and a 10% perf increase for Gemma3n (#15132)
* Factor out `reduce_rows_f32` from common.cuh. This increases iteration cycle speed by not having to recompile every kernel all the time.

* Hide memory latency by loop unrolling in `reduce_rows_f32`.

* Further optimizations to `reduce_rows_f32`:
  1. Increase threadblock size to better hide the latency of memory requests. As a consequence of the bigger threadblocks, do a 2-step summation, using shared memory to communicate partial results between warps.
  2. Use a `sum_temp` array to reduce waits on `sum`.
  3. Adjust `num_unroll` to reflect the bigger threadblock.
  4. Improve the default `block_dims` and support more `block_dims` values.

* Add perf tests for the `reduce_rows_f32` kernel.

* Add a heuristic to toggle between 128 and 512 threads based on SM count (see the sketch after this list). The break-even point was the minimum of the following multiples:

  | GPU Model                    | Nrow / SM-count multiple |
  | ---------------------------- | ------------------------ |
  | RTX 4000 SFF ADA             | 2.0x                     |
  | RTX 6000 ADA                 | 2.5x                     |
  | RTX PRO 6000 Blackwell Max-Q | 3.04x                    |
  | RTX PRO 4500 Blackwell       | 3.15x                    |

* Ensure perf gains also for small ncols and large nrows. Alternatively, one could have made the number of unrollings template-able, but that would require compiling the kernel multiple times, increasing binary size unnecessarily.

* Modify perf and unit tests.

* Apply auto-formatting by clang.

* Fix CI build failure. See https://github.com/ggml-org/llama.cpp/actions/runs/16798370266/job/47573716079?pr=15132#step:7:486 — building with the VS generator worked, though.

* Remove the sm_count property from `ggml_backend_cuda_context`. Requested by @JohannesGaessler, and should fix the remaining CI issues as a side effect.

* Add a CUB-based implementation for GGML_OP_MEAN. Currently this branch is only executed for nrows==1.

* Add heuristics to execute the CUB branch only when it brings a perf gain. Heuristics were determined on the following HW:
  * RTX 4000 SFF ADA
  * RTX 6000 ADA
  * RTX PRO 6000 Blackwell Max-Q
  * RTX PRO 4500 Blackwell

* Add a unit test for the CUB-based mean. Tests should run with CUDA Graphs enabled per default on NVGPUs.

* Rename `USE_CUB` to `GGML_CUDA_USE_CUB`. Suggested by @JohannesGaessler.

* Unindent preprocessor directives. See #15132 (comment).
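As a quick consolidation of the block-size heuristic described above: the dispatch rule added to `sumrows.cu` and `mean.cu` (see the diffs below) can be sketched as follows. The helper name `pick_block_dims` is illustrative only and does not exist in the codebase.

```cuda
// Illustrative sketch of the dispatch heuristic, not an actual llama.cpp function.
// nsm is the streaming-multiprocessor count reported by ggml_cuda_info().
static dim3 pick_block_dims(int64_t nrows, int64_t ncols, int nsm) {
    if ((nrows / nsm) < 2) {
        // Few rows relative to the SM count: large 512-thread blocks hide memory latency better.
        return dim3(512, 1, 1);
    }
    // Enough rows to keep all SMs busy: smaller blocks allow better scheduling.
    return dim3(ncols < 1024 ? 32 : 128, 1, 1);
}
```

For example, on a hypothetical GPU with 142 SMs, an 8192-row reduction takes the small-block path (8192 / 142 ≈ 57 ≥ 2), while a single-row reduction uses 512 threads per block.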
1 parent bc51822 commit 6028bf7

6 files changed: +155 −36 lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 4 additions & 20 deletions
@@ -87,6 +87,10 @@
 #define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)

+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+# define GGML_CUDA_USE_CUB
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
     return false;
@@ -420,26 +424,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

-// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template<bool norm>
-static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col != 0) {
-        return;
-    }
-
-    dst[row] = norm ? sum / ncols : sum;
-}
-
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP

ggml/src/ggml-cuda/mean.cu

Lines changed: 50 additions & 2 deletions
@@ -1,4 +1,14 @@
 #include "mean.cuh"
+#include "reduce_rows.cuh"
+
+#ifdef GGML_CUDA_USE_CUB
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // GGML_CUDA_USE_CUB
+
+template <typename T> __global__ void divide_by_count(T * result, size_t count) {
+    *result /= static_cast<T>(count);
+}

 void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
@@ -13,7 +23,45 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);

-    const dim3 block_dims(WARP_SIZE, 1, 1);
+    // Special case for reducing vectors
+#ifdef GGML_CUDA_USE_CUB
+    cudaStreamCaptureStatus iscapturing;
+    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
+    if ((nrows == 1) &&
+        // CUDA_GRAPHS_DISABLED
+        ((ncols > 65536) &&
+         ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
+          ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
+          ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
+        // CUDA_GRAPHS ENABLED
+        ((ncols > 32768) &&
+         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
+           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
+           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
+        // Single row - use device-wide reduction
+        size_t tmp_size = 0;
+        ggml_cuda_pool & pool = ctx.pool();
+
+        DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
+
+        ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+        DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
+
+        // Divide by ncols
+        divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
+        return;
+    }
+#endif
+
     const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+
+    const int id = ggml_cuda_get_device();
+    const int nsm = ggml_cuda_info().devices[id].nsm;
+    if ((nrows / nsm) < 2) {
+        const dim3 block_dims(512, 1, 1);
+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    } else {
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    }
 }
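The CUB branch above uses the library's standard two-phase pattern: the first `DeviceReduce::Sum` call with a null temporary-storage pointer only queries the required scratch size, and the second call performs the reduction. A minimal stand-alone sketch of that pattern, using plain `cudaMallocAsync` instead of the ggml memory pool (names are illustrative):

```cuda
#include <cub/cub.cuh>

// Sketch only: sum n floats from d_in into *d_out on the given stream.
// Error handling is omitted; d_in and d_out are assumed valid device pointers.
static void device_sum_f32(const float * d_in, float * d_out, int n, cudaStream_t stream) {
    size_t tmp_bytes = 0;
    // Phase 1: null temp storage -> CUB only reports the scratch size it needs.
    cub::DeviceReduce::Sum(nullptr, tmp_bytes, d_in, d_out, n, stream);

    void * d_tmp = nullptr;
    cudaMallocAsync(&d_tmp, tmp_bytes, stream);

    // Phase 2: run the actual device-wide reduction.
    cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_in, d_out, n, stream);
    cudaFreeAsync(d_tmp, stream);
}
```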

ggml/src/ggml-cuda/reduce_rows.cuh

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+#include "common.cuh"
+
+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
+template <bool norm>
+static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    const int num_unroll = 8;
+    float temp[num_unroll];
+    float sum_temp[num_unroll] = { 0.0f };
+    for (int i = col; i < ncols;) {
+        for (int j = 0; j < num_unroll; ++j) {
+            if (i < ncols) {
+                temp[j] = x[row * ncols + i];
+            } else {
+                temp[j] = 0;
+            }
+            i += blockDim.x;
+        }
+        for (int j = 0; j < num_unroll; ++j) {
+            sum_temp[j] += temp[j];
+        }
+    }
+    for (int j = 0; j < num_unroll; ++j) {
+        sum += sum_temp[j];
+    }
+
+    // sum up partial sums
+    sum = warp_reduce_sum(sum);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = sum;
+        }
+        __syncthreads();
+        sum = 0.0f;
+        if (lane_id < (blockDim.x / WARP_SIZE)) {
+            sum = s_sum[lane_id];
+        }
+        sum = warp_reduce_sum(sum);
+    }
+
+    if (col != 0) {
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
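For reference, a stand-alone launch of the kernel above might look like the following sketch, assuming reduce_rows.cuh is included; the buffer names and dimensions are made up for illustration, and in ggml the launch goes through `sum_rows_f32_cuda` / `ggml_cuda_op_mean` as shown in the neighbouring diffs.

```cuda
// Illustration only: sum each row of a [ncols x nrows] f32 matrix into d_sums.
// d_x and d_sums are assumed to be valid device allocations, stream a valid cudaStream_t.
static void launch_row_sums(const float * d_x, float * d_sums, int ncols, int nrows, cudaStream_t stream) {
    const dim3 block_nums(nrows, 1, 1);   // one threadblock per row
    const dim3 block_dims(128, 1, 1);     // must be a multiple of WARP_SIZE and <= 1024
    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(d_x, d_sums, ncols);
}
```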

ggml/src/ggml-cuda/sum.cu

Lines changed: 6 additions & 10 deletions
@@ -1,19 +1,15 @@
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
-#define USE_CUB
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+#include "sum.cuh"
+#include "sumrows.cuh"

-#ifdef USE_CUB
+#ifdef GGML_CUDA_USE_CUB
 #include <cub/cub.cuh>
 using namespace cub;
-#endif // USE_CUB
-
-#include "sumrows.cuh"
-#include "sum.cuh"
+#endif // GGML_CUDA_USE_CUB

 #include <cstdint>

 void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
-#ifdef USE_CUB
+#ifdef GGML_CUDA_USE_CUB
     size_t tmp_size = 0;
     DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
     ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
@@ -23,7 +19,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
     // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
     sum_rows_f32_cuda(x, dst, ne, 1, stream);
     GGML_UNUSED(pool);
-#endif // USE_CUB
+#endif // GGML_CUDA_USE_CUB
 }

 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

ggml/src/ggml-cuda/sumrows.cu

Lines changed: 21 additions & 4 deletions
@@ -1,9 +1,17 @@
+#include "reduce_rows.cuh"
 #include "sumrows.cuh"

 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const int id = ggml_cuda_get_device();
+    const int nsm = ggml_cuda_info().devices[id].nsm;
     const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    if ((nrows / nsm) < 2) {
+        const dim3 block_dims(512, 1, 1);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }

 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);

-    const dim3 block_dims(WARP_SIZE, 1, 1);
     const dim3 block_nums(nrows, 1, 1);

-    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    const int id = ggml_cuda_get_device();
+    const int nsm = ggml_cuda_info().devices[id].nsm;
+    if ((nrows / nsm) < 2) {
+        // Increase num threads to 512 for small nrows to better hide the latency
+        const dim3 block_dims(512, 1, 1);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    } else {
+        // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    }
 }

tests/test-backend-ops.cpp

Lines changed: 21 additions & 0 deletions
@@ -5998,6 +5998,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_mean());
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
@@ -6179,6 +6188,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token));
     }

+    std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
+        { 8192, 1, 1, 1 },
+        { 8192, 8192, 1, 1 },
+        { 128, 8192, 1, 1 },
+    };
+
+    for (auto it: reduce_rows_cases){
+        test_cases.emplace_back(new test_mean(GGML_TYPE_F32, it));
+        test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, it));
+        test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
+    }
+
     return test_cases;
 }
