#include "argsort.cuh"
#include "top-k.cuh"

#ifdef GGML_CUDA_USE_CUB
#   include <cub/cub.cuh>
#   if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
#       define CUB_TOP_K_AVAILABLE
using namespace cub;
#   endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
#endif // GGML_CUDA_USE_CUB

#ifdef CUB_TOP_K_AVAILABLE
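// Fill indices[0..ncols-1] with the sequence 0..ncols-1; these serve as the values
// of the key/value pairs handed to cub::DeviceTopK::MaxPairs below.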
static __global__ void init_indices(int * indices, const int ncols) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col < ncols) {
        indices[col] = col;
    }
}

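// Top-k for a single row: writes the indices of the k largest values of src into dst.
// Since output_ordering::unsorted is requested, the order of the returned indices is unspecified.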
static void top_k_cub(ggml_cuda_pool & pool,
                      const float * src,
                      int * dst,
                      const int ncols,
                      const int k,
                      cudaStream_t stream) {
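    // Build a CUB execution environment: run on the given stream and signal that a
    // nondeterministic, unsorted result is acceptable, so CUB may choose a faster path.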
    auto requirements = cuda::execution::require(cuda::execution::determinism::not_guaranteed,
                                                 cuda::execution::output_ordering::unsorted);
    auto stream_env = cuda::stream_ref{ stream };
    auto env = cuda::std::execution::env{ stream_env, requirements };

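    // Work on scratch buffers: temp_keys receives a copy of the row so the source tensor is
    // never written to, and it doubles as the (discarded) key output of MaxPairs.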
    ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols);
    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols);

    int * temp_indices = temp_indices_alloc.get();
    float * temp_keys = temp_keys_alloc.get();

    static const int block_size = 256;
    const dim3 grid_size((ncols + block_size - 1) / block_size, 1);
    init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols);

    CUDA_CHECK(cudaMemcpyAsync(temp_keys, src, ncols * sizeof(float), cudaMemcpyDeviceToDevice, stream));

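    // Usual two-phase CUB pattern: the first call with a null temp-storage pointer only
    // queries the required scratch size, the second call performs the actual top-k selection.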
    size_t temp_storage_bytes = 0;
    DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols, k, env);

    ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
    void * d_temp_storage = temp_storage_alloc.get();

    DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols, k, env);
}

#else

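// CUB top-k is unavailable: fall back to a full argsort per row and keep the first k indices.
// next_power_of_2 is used below to estimate the shared memory the bitonic argsort kernel
// would need, since that kernel pads each row to a power-of-two length.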
static int next_power_of_2(int x) {
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}

#endif // CUB_TOP_K_AVAILABLE

void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) src0->data;
    int * dst_d = (int *) dst->data;
    cudaStream_t stream = ctx.stream();

    // TODO: are these asserts truly necessary?
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);
    const int64_t k = dst->ne[0];
    ggml_cuda_pool & pool = ctx.pool();
#ifdef CUB_TOP_K_AVAILABLE
    // TODO: Switch to `DeviceSegmentedTopK` for multi-row TopK once implemented
    // https://github.com/NVIDIA/cccl/issues/6391
    // TODO: investigate if there exists a point where parallelized argsort is faster than sequential top-k
    for (int i = 0; i < nrows; i++) {
        top_k_cub(pool, src0_d + i * ncols, dst_d + i * k, ncols, k, stream);
    }
#else
    // Fall back to argsort + copy
    const int ncols_pad = next_power_of_2(ncols);
    const size_t shared_mem = ncols_pad * sizeof(int);
    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;

    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
    int * tmp_dst = temp_dst_alloc.get();

    if (shared_mem > max_shared_mem || ncols > 1024) {
        argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
    } else {
        argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
    }
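    // tmp_dst now holds a full descending argsort of every row; copy just the first k
    // indices of each row into dst (source pitch: ncols ints, destination pitch: k ints).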
    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
                                 cudaMemcpyDeviceToDevice, stream));
#endif // CUB_TOP_K_AVAILABLE
}