forked from rapidsai/cuvs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathivf_common.cu
More file actions
75 lines (65 loc) · 3.17 KB
/
ivf_common.cu
File metadata and controls
75 lines (65 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#include "ivf_common.cuh"
#include <raft/core/cudart_utils.hpp>
#include <raft/util/pow2_utils.cuh>
#include <cub/cub.cuh>
namespace cuvs::neighbors::ivf::detail {
/**
* For each query, we calculate a cumulative sum of the cluster sizes that we probe, and return that
* in chunk_indices. Essentially this is a segmented inclusive scan of the cluster sizes. The total
* number of samples per query (sum of the cluster sizes that we probe) is returned in n_samples.
*/
template <int BlockDim>
__launch_bounds__(BlockDim) RAFT_KERNEL
calc_chunk_indices_kernel(uint32_t n_probes,
const uint32_t* cluster_sizes, // [n_clusters]
const uint32_t* clusters_to_probe, // [n_queries, n_probes]
uint32_t* chunk_indices, // [n_queries, n_probes]
uint32_t* n_samples // [n_queries]
)
{
using block_scan = cub::BlockScan<uint32_t, BlockDim>;
__shared__ typename block_scan::TempStorage shm;
// locate the query data
clusters_to_probe += n_probes * blockIdx.x;
chunk_indices += n_probes * blockIdx.x;
// block scan
const uint32_t n_probes_aligned = raft::Pow2<BlockDim>::roundUp(n_probes);
uint32_t total = 0;
for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) {
auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u;
auto chunk = probe_ix < n_probes ? cluster_sizes[label] : 0u;
if (threadIdx.x == 0) { chunk += total; }
block_scan(shm).InclusiveSum(chunk, chunk, total);
__syncthreads();
if (probe_ix < n_probes) { chunk_indices[probe_ix] = chunk; }
}
// save the total size
if (threadIdx.x == 0) { n_samples[blockIdx.x] = total; }
}
void calc_chunk_indices::configured::operator()(const uint32_t* cluster_sizes,
const uint32_t* clusters_to_probe,
uint32_t* chunk_indices,
uint32_t* n_samples,
rmm::cuda_stream_view stream)
{
void* kernel = nullptr;
switch (block_dim.x) {
case 32: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<32>); break;
case 64: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<64>); break;
case 128: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<128>); break;
case 256: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<256>); break;
case 512: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<512>); break;
case 1024: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<1024>); break;
default:
RAFT_FAIL("Unsupported block dimension for calc_chunk_indices::configured() : %d",
block_dim.x);
}
void* args[] = // NOLINT
{&n_probes, &cluster_sizes, &clusters_to_probe, &chunk_indices, &n_samples};
RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream));
}
} // namespace cuvs::neighbors::ivf::detail