Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions ggml/src/ggml-cuda/sum.cu
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#define USE_CUB
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700

#ifdef USE_CUB
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
// For this reason CUB must be included BEFORE anything else.
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#endif // USE_CUB

#include "sumrows.cuh"
#include "sum.cuh"

#include <cstdint>

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#ifdef USE_CUB
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
Expand All @@ -21,7 +25,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
// For AMD, rocPRIM could be used as a drop-in replacement via hipcub, but this would require raising the C++ standard from C++11 to C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#endif // USE_CUB
}

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
Expand Down
Loading