24 changes: 3 additions & 21 deletions ggml/src/ggml-cuda/common.cuh
@@ -412,26 +412,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#endif // FP16_AVAILABLE
}

// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template<bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x;
const int col = threadIdx.x;

float sum = 0.0f;
for (int i = col; i < ncols; i += blockDim.x) {
sum += x[row * ncols + i];
}

sum = warp_reduce_sum(sum);

if (col != 0) {
return;
}

dst[row] = norm ? sum / ncols : sum;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
#ifdef GGML_USE_HIP
@@ -864,6 +844,7 @@ struct ggml_cuda_graph {
struct ggml_backend_cuda_context {
int device;
std::string name;
int sm_count;
cudaEvent_t copy_event = nullptr;

cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
@@ -873,7 +854,8 @@ struct ggml_backend_cuda_context {

explicit ggml_backend_cuda_context(int device) :
device(device),
name(GGML_CUDA_NAME + std::to_string(device)) {
name(GGML_CUDA_NAME + std::to_string(device)),
sm_count(ggml_cuda_info().devices[device].nsm) {
}

~ggml_backend_cuda_context();
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/cross-entropy-loss.cu
@@ -131,7 +131,7 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
CUDA_CHECK(cudaGetLastError());

// Combine results from individual blocks:
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream, ctx.sm_count);
}

void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
10 changes: 8 additions & 2 deletions ggml/src/ggml-cuda/mean.cu
@@ -1,4 +1,5 @@
#include "mean.cuh"
#include "reduce_rows.cuh"

void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
@@ -13,7 +14,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);

const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
if ((nrows / ctx.sm_count) < 2) {
const dim3 block_dims(512, 1, 1);
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
} else {
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
}
}
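The launch heuristic above is repeated verbatim in sumrows.cu further down. Factored out, it amounts to the small helper sketched here (an illustrative refactor, not part of this change; reduce_rows_block_dims is an invented name):

// Sketch of the shared block-size choice: with fewer than 2 rows per SM there are
// not enough blocks to occupy the GPU, so each block gets 512 threads to hide
// memory latency; otherwise narrower blocks (32 threads for short rows, 128 for
// rows of 1024+ elements) leave the scheduler more blocks per SM to interleave.
static dim3 reduce_rows_block_dims(const int64_t nrows, const int64_t ncols, const int sm_count) {
    if (nrows / sm_count < 2) {
        return dim3(512, 1, 1);
    }
    return dim3(ncols < 1024 ? 32 : 128, 1, 1);
}

With such a helper, each launch site would reduce to a single reduce_rows_f32<norm><<<block_nums, reduce_rows_block_dims(nrows, ncols, ctx.sm_count), 0, stream>>> call.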
53 changes: 53 additions & 0 deletions ggml/src/ggml-cuda/reduce_rows.cuh
@@ -0,0 +1,53 @@
#include "common.cuh"

// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
const int row = blockIdx.x;
const int col = threadIdx.x;

float sum = 0.0f;
const int num_unroll = 8;
float temp[num_unroll];
float sum_temp[num_unroll] = { 0.0f };
for (int i = col; i < ncols;) {
for (int j = 0; j < num_unroll; ++j) {
if (i < ncols) {
temp[j] = x[row * ncols + i];
} else {
temp[j] = 0;
}
i += blockDim.x;
}
for (int j = 0; j < num_unroll; ++j) {
sum_temp[j] += temp[j];
}
}
for (int j = 0; j < num_unroll; ++j) {
sum += sum_temp[j];
}

// sum up partial sums
sum = warp_reduce_sum(sum);
if (blockDim.x > WARP_SIZE) {
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = sum;
}
__syncthreads();
sum = 0.0f;
if (lane_id < (blockDim.x / WARP_SIZE)) {
sum = s_sum[lane_id];
}
sum = warp_reduce_sum(sum);
}

if (col != 0) {
return;
}

dst[row] = norm ? sum / ncols : sum;
}
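The kernel delegates the intra-warp step to warp_reduce_sum from common.cuh; a generic shuffle-based float reduction of that kind looks roughly like the sketch below (illustrative only, assuming a full-warp mask and WARP_SIZE == 32; not the ggml implementation):

// Butterfly (XOR-shuffle) warp reduction: after the loop every lane holds the sum
// of all 32 lanes' inputs. The block-wide path above writes one such partial per
// warp into s_sum and repeats the same warp reduction over those partials, which
// is why blockDim.x is asserted to be at most 1024 (i.e. 32 warps).
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = 32 / 2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, 32);
    }
    return x;
}

The 8-wide unroll in the load loop issues eight independent global loads per thread before accumulating them, which raises memory-level parallelism on long rows; the i < ncols guard pads the tail with zeros so partial groups stay correct.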
4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/sum.cu
@@ -12,7 +12,7 @@ using namespace cub;

#include <cstdint>

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream, int & n_sm) {
#ifdef USE_CUB
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
@@ -41,5 +41,5 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();

sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
sum_f32_cuda(pool, src0_d, dst_d, ne, stream, ctx.sm_count);
}
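For context, the CUB branch at the top of sum_f32_cuda follows the standard two-call DeviceReduce pattern: the first call with a null temp-storage pointer only computes tmp_size, scratch of that size is then taken from the ggml pool, and a second call performs the reduction. A generic sketch of that pattern (assuming the ggml_cuda_pool_alloc helper; not the collapsed body of this hunk):

// Query the scratch size, allocate it from the pool, then sum ne floats from x
// into dst on the given stream.
size_t tmp_size = 0;
cub::DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
cub::DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);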
7 changes: 6 additions & 1 deletion ggml/src/ggml-cuda/sum.cuh
@@ -1,5 +1,10 @@
#include "common.cuh"

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
void sum_f32_cuda(ggml_cuda_pool & pool,
const float * x,
float * dst,
const int64_t ne,
cudaStream_t stream,
int & n_sm);

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
28 changes: 23 additions & 5 deletions ggml/src/ggml-cuda/sumrows.cu
@@ -1,9 +1,20 @@
#include "reduce_rows.cuh"
#include "sumrows.cuh"

void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
void sum_rows_f32_cuda(const float * x,
float * dst,
const int ncols,
const int nrows,
cudaStream_t stream,
int & n_sm) {
const dim3 block_nums(nrows, 1, 1);
reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
if ((nrows / n_sm) < 2) {
const dim3 block_dims(512, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
} else {
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}
}

void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -19,8 +30,15 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);

const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);

reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
if ((nrows / ctx.sm_count) < 2) {
// Increase num threads to 512 for small nrows to better hide the latency
const dim3 block_dims(512, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
} else {
// Enough active SMs to hide latency, use smaller blocks to allow better scheduling
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
}
}
20 changes: 20 additions & 0 deletions tests/test-backend-ops.cpp
@@ -5998,6 +5998,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_sum());
test_cases.emplace_back(new test_sum_rows());
test_cases.emplace_back(new test_mean());
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
test_cases.emplace_back(new test_acc());
@@ -6179,6 +6187,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token));
}

std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
{ 8192, 1, 1, 1 },
{ 8192, 8192, 1, 1 },
{ 128, 8192, 1, 1 },
};

for (auto it: reduce_rows_cases){
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, it));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, it));
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
}

return test_cases;
}
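The new eval shapes with ne0 = 33 are deliberately not a multiple of the warp size or of the kernel's 8-wide unroll, so they exercise the tail handling in the unrolled load loop. The perf shapes cover both launch branches: { 8192, 1, 1, 1 } is a single long row and always takes the 512-thread path, while { 128, 8192, 1, 1 } and { 8192, 8192, 1, 1 } provide far more rows than SMs and take the 32- and 128-thread paths respectively. The perf cases can be exercised with the existing binary, e.g. test-backend-ops perf -o MEAN (flag and op-name spelling assumed; see the tool's help output).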
