cuvs/cpp/src/neighbors/ivf_common.cu at 4d6c6bc7de26df1f64f28ded19e1ed46837f773d · jinsolp/cuvs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "ivf_common.cuh"

#include <raft/core/cudart_utils.hpp>
#include <raft/util/pow2_utils.cuh>

#include <cub/cub.cuh>

namespace cuvs::neighbors::ivf::detail {

/**
 * For each query, we calculate a cumulative sum of the cluster sizes that we probe, and return that
 * in chunk_indices. Essentially this is a segmented inclusive scan of the cluster sizes. The total
 * number of samples per query (sum of the cluster sizes that we probe) is returned in n_samples.
 */
template <int BlockDim>
__launch_bounds__(BlockDim) RAFT_KERNEL
  calc_chunk_indices_kernel(uint32_t n_probes,
                            const uint32_t* cluster_sizes,      // [n_clusters]
                            const uint32_t* clusters_to_probe,  // [n_queries, n_probes]
                            uint32_t* chunk_indices,            // [n_queries, n_probes]
                            uint32_t* n_samples                 // [n_queries]
  )
{
  using block_scan = cub::BlockScan<uint32_t, BlockDim>;
  __shared__ typename block_scan::TempStorage shm;

  // locate the query data
  clusters_to_probe += n_probes * blockIdx.x;
  chunk_indices += n_probes * blockIdx.x;

  // block scan
  const uint32_t n_probes_aligned = raft::Pow2<BlockDim>::roundUp(n_probes);
  uint32_t total                  = 0;
  for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) {
    auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u;
    auto chunk = probe_ix < n_probes ? cluster_sizes[label] : 0u;
    if (threadIdx.x == 0) { chunk += total; }
    block_scan(shm).InclusiveSum(chunk, chunk, total);
    __syncthreads();
    if (probe_ix < n_probes) { chunk_indices[probe_ix] = chunk; }
  }
  // save the total size
  if (threadIdx.x == 0) { n_samples[blockIdx.x] = total; }
}

void calc_chunk_indices::configured::operator()(const uint32_t* cluster_sizes,
                                                const uint32_t* clusters_to_probe,
                                                uint32_t* chunk_indices,
                                                uint32_t* n_samples,
                                                rmm::cuda_stream_view stream)
{
  void* kernel = nullptr;
  switch (block_dim.x) {
    case 32: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<32>); break;
    case 64: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<64>); break;
    case 128: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<128>); break;
    case 256: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<256>); break;
    case 512: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<512>); break;
    case 1024: kernel = reinterpret_cast<void*>(calc_chunk_indices_kernel<1024>); break;
    default:
      RAFT_FAIL("Unsupported block dimension for calc_chunk_indices::configured() : %d",
                block_dim.x);
  }

  void* args[] =  // NOLINT
    {&n_probes, &cluster_sizes, &clusters_to_probe, &chunk_indices, &n_samples};
  RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream));
}

}  // namespace cuvs::neighbors::ivf::detail