Skip to content

Commit 3e04706

Browse files
authored
Update cuco implementations to use cuda::std utilities when appropriate (#708)
This PR updates cuco implementations to use `cuda::std` utilities in place of `std` in device code and to replace `thrust` utilities wherever possible, as they are being deprecated in CCCL.
1 parent 65ca487 commit 3e04706

25 files changed

+132
-123
lines changed

include/cuco/detail/bloom_filter/bloom_filter_impl.cuh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,8 @@ class bloom_filter_impl {
183183
// If single thread is optimal, use scalar add
184184
if constexpr (worker_num_threads == 1) {
185185
for (auto i = rank; i < num_keys; i += num_threads) {
186-
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + i)};
186+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
187+
*(first + i)};
187188
this->add(insert_element);
188189
}
189190
} else if constexpr (num_threads == worker_num_threads) { // given CG is optimal CG
@@ -193,7 +194,7 @@ class bloom_filter_impl {
193194
auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
194195
for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
195196
if (i + rank < num_keys) {
196-
typename std::iterator_traits<InputIt>::value_type const& insert_element{
197+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
197198
*(first + i + rank)};
198199
hash_value = policy_.hash(insert_element);
199200
block_index = policy_.block_index(hash_value, num_blocks_);
@@ -214,7 +215,7 @@ class bloom_filter_impl {
214215

215216
for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
216217
if (i + rank < num_keys) {
217-
typename std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
218+
typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
218219
hash_value = policy_.hash(key);
219220
block_index = policy_.block_index(hash_value, num_blocks_);
220221
}

include/cuco/detail/bloom_filter/kernels.cuh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@
1717

1818
#include <cuco/detail/utility/cuda.cuh>
1919

20+
#include <cuda/std/iterator>
21+
2022
#include <cooperative_groups.h>
2123

2224
#include <cstdint>
23-
#include <iterator>
2425

2526
namespace cuco::detail::bloom_filter_ns {
2627

@@ -66,7 +67,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
6667

6768
while (idx < n) {
6869
if (pred(*(stencil + idx))) {
69-
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
70+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
71+
*(first + idx)};
7072
ref.add(tile, insert_element);
7173
}
7274
idx += loop_stride;
@@ -96,14 +98,14 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
9698

9799
if constexpr (CGSize == 1) {
98100
while (idx < n) {
99-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
101+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
100102
*(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false;
101103
idx += loop_stride;
102104
}
103105
} else {
104106
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
105107
while (idx < n) {
106-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
108+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
107109
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
108110
if (tile.thread_rank() == 0) { *(out + idx) = found; }
109111
idx += loop_stride;

include/cuco/detail/open_addressing/functors.cuh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
1818
#include <cuco/detail/bitwise_compare.cuh>
1919
#include <cuco/detail/pair/traits.hpp>
2020

21+
#include <thrust/tuple.h>
22+
2123
namespace cuco::detail::open_addressing_ns {
2224

2325
/**
@@ -49,7 +51,7 @@ struct get_slot {
4951
auto const intra_idx = idx % StorageRef::bucket_size;
5052
if constexpr (HasPayload) {
5153
auto const [first, second] = storage_[bucket_idx][intra_idx];
52-
return thrust::make_tuple(first, second);
54+
return thrust::tuple{first, second};
5355
} else {
5456
return storage_[bucket_idx][intra_idx];
5557
}

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -20,11 +20,11 @@
2020
#include <cub/block/block_reduce.cuh>
2121
#include <cuda/atomic>
2222
#include <cuda/functional>
23+
#include <cuda/std/iterator>
24+
#include <cuda/std/type_traits>
2325

2426
#include <cooperative_groups.h>
2527

26-
#include <iterator>
27-
2828
namespace cuco::detail::open_addressing_ns {
2929
CUCO_SUPPRESS_KERNEL_WARNINGS
3030

@@ -77,7 +77,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
7777

7878
while (idx < n) {
7979
if (pred(*(stencil + idx))) {
80-
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
80+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
81+
*(first + idx)};
8182
if constexpr (CGSize == 1) {
8283
if (ref.insert(insert_element)) { thread_num_successes++; };
8384
} else {
@@ -135,7 +136,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
135136

136137
while (idx < n) {
137138
if (pred(*(stencil + idx))) {
138-
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
139+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
140+
*(first + idx)};
139141
if constexpr (CGSize == 1) {
140142
ref.insert(insert_element);
141143
} else {
@@ -170,7 +172,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
170172
auto idx = cuco::detail::global_thread_id() / CGSize;
171173

172174
while (idx < n) {
173-
typename std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
175+
typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
174176
if constexpr (CGSize == 1) {
175177
ref.erase(erase_element);
176178
} else {
@@ -210,7 +212,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
210212
auto idx = cuco::detail::global_thread_id() / CGSize;
211213

212214
while (idx < n) {
213-
typename std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
215+
typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
214216
if constexpr (CGSize == 1) {
215217
ref.for_each(key, callback_op);
216218
} else {
@@ -273,7 +275,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
273275
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
274276
if constexpr (CGSize == 1) {
275277
if (idx < n) {
276-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
278+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
277279
/*
278280
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
279281
* sector stores from L2 to global memory. By writing results to shared memory and then
@@ -287,7 +289,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
287289
} else {
288290
auto const tile = cg::tiled_partition<CGSize>(block);
289291
if (idx < n) {
290-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
292+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
291293
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
292294
if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
293295
}
@@ -367,7 +369,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
367369
using output_type = typename find_buffer<Ref>::type;
368370
__shared__ output_type output_buffer[BlockSize / CGSize];
369371

370-
auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
372+
auto constexpr has_payload =
373+
not cuda::std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
371374

372375
auto const sentinel = [&]() {
373376
if constexpr (has_payload) {
@@ -388,8 +391,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
388391
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
389392
if constexpr (CGSize == 1) {
390393
if (idx < n) {
391-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
392-
auto const found = ref.find(key);
394+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
395+
auto const found = ref.find(key);
393396
/*
394397
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
395398
* sector stores from L2 to global memory. By writing results to shared memory and then
@@ -403,8 +406,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
403406
} else {
404407
auto const tile = cg::tiled_partition<CGSize>(block);
405408
if (idx < n) {
406-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
407-
auto const found = ref.find(tile, key);
409+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
410+
auto const found = ref.find(tile, key);
408411

409412
if (tile.thread_rank() == 0) {
410413
*(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -461,7 +464,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
461464

462465
using output_type = typename find_buffer<Ref>::type;
463466

464-
auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
467+
auto constexpr has_payload =
468+
not cuda::std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
465469

466470
auto output = cuda::proclaim_return_type<output_type>([&] __device__(auto found) {
467471
if constexpr (has_payload) {
@@ -477,7 +481,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
477481
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
478482
if constexpr (CGSize == 1) {
479483
if (idx < n) {
480-
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
484+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
485+
*(first + idx)};
481486
auto const [iter, inserted] = ref.insert_and_find(insert_element);
482487
/*
483488
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
@@ -496,7 +501,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
496501
} else {
497502
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
498503
if (idx < n) {
499-
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
504+
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
505+
*(first + idx)};
500506
auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
501507
if (tile.thread_rank() == 0) {
502508
*(found_begin + idx) = output(iter);
@@ -546,7 +552,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
546552
auto idx = cuco::detail::global_thread_id() / CGSize;
547553

548554
while (idx < n) {
549-
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
555+
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
550556
if constexpr (CGSize == 1) {
551557
if constexpr (IsOuter) {
552558
thread_count += max(ref.count(key), outer_min_count);

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,12 @@ class open_addressing_impl {
7878
"Key type must have unique object representations or have been explicitly declared as safe for "
7979
"bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Key>.");
8080

81-
static_assert(
82-
std::is_base_of_v<cuco::detail::probing_scheme_base<ProbingScheme::cg_size>, ProbingScheme>,
83-
"ProbingScheme must inherit from cuco::detail::probing_scheme_base");
81+
static_assert(cuda::std::is_base_of_v<cuco::detail::probing_scheme_base<ProbingScheme::cg_size>,
82+
ProbingScheme>,
83+
"ProbingScheme must inherit from cuco::detail::probing_scheme_base");
8484

8585
/// Determines if the container is a key/value or key-only store
86-
static constexpr auto has_payload = not std::is_same_v<Key, Value>;
86+
static constexpr auto has_payload = not cuda::std::is_same_v<Key, Value>;
8787

8888
public:
8989
static constexpr auto cg_size = ProbingScheme::cg_size; ///< CG size used for probing

include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,13 @@
3030
#include <thrust/execution_policy.h>
3131
#include <thrust/logical.h>
3232
#include <thrust/reduce.h>
33-
#include <thrust/tuple.h>
3433
#if defined(CUCO_HAS_CUDA_BARRIER)
3534
#include <cuda/barrier>
3635
#endif
3736

3837
#include <cooperative_groups.h>
3938

4039
#include <cstdint>
41-
#include <type_traits>
4240

4341
namespace cuco {
4442
namespace detail {
@@ -97,12 +95,13 @@ class open_addressing_ref_impl {
9795
"Key type must have unique object representations or have been explicitly declared as safe for "
9896
"bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Key>.");
9997

100-
static_assert(
101-
std::is_base_of_v<cuco::detail::probing_scheme_base<ProbingScheme::cg_size>, ProbingScheme>,
102-
"ProbingScheme must inherit from cuco::detail::probing_scheme_base");
98+
static_assert(cuda::std::is_base_of_v<cuco::detail::probing_scheme_base<ProbingScheme::cg_size>,
99+
ProbingScheme>,
100+
"ProbingScheme must inherit from cuco::detail::probing_scheme_base");
103101

104102
/// Determines if the container is a key/value or key-only store
105-
static constexpr auto has_payload = not std::is_same_v<Key, typename StorageRef::value_type>;
103+
static constexpr auto has_payload =
104+
not cuda::std::is_same_v<Key, typename StorageRef::value_type>;
106105

107106
/// Flag indicating whether duplicate keys are allowed or not
108107
static constexpr auto allows_duplicates = AllowsDuplicates;
@@ -187,7 +186,7 @@ class open_addressing_ref_impl {
187186
*
188187
* @return The sentinel value used to represent an empty payload slot
189188
*/
190-
template <bool Dummy = true, typename Enable = std::enable_if_t<has_payload and Dummy>>
189+
template <bool Dummy = true, typename Enable = cuda::std::enable_if_t<has_payload and Dummy>>
191190
[[nodiscard]] __host__ __device__ constexpr auto empty_value_sentinel() const noexcept
192191
{
193192
return this->extract_payload(this->empty_slot_sentinel());
@@ -514,7 +513,7 @@ class open_addressing_ref_impl {
514513
* insertion is successful or not.
515514
*/
516515
template <typename Value>
517-
__device__ thrust::pair<iterator, bool> insert_and_find(Value const& value) noexcept
516+
__device__ cuda::std::pair<iterator, bool> insert_and_find(Value const& value) noexcept
518517
{
519518
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
520519
#if __CUDA_ARCH__ < 700
@@ -587,7 +586,7 @@ class open_addressing_ref_impl {
587586
* insertion is successful or not.
588587
*/
589588
template <typename Value>
590-
__device__ thrust::pair<iterator, bool> insert_and_find(
589+
__device__ cuda::std::pair<iterator, bool> insert_and_find(
591590
cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
592591
{
593592
#if __CUDA_ARCH__ < 700
@@ -1157,7 +1156,7 @@ class open_addressing_ref_impl {
11571156

11581157
if (n == 0) { return; }
11591158

1160-
using probe_type = typename std::iterator_traits<InputProbeIt>::value_type;
1159+
using probe_type = typename cuda::std::iterator_traits<InputProbeIt>::value_type;
11611160

11621161
// tuning parameter
11631162
auto constexpr buffer_multiplier = 1;
@@ -1514,7 +1513,7 @@ class open_addressing_ref_impl {
15141513
*
15151514
* @return The payload
15161515
*/
1517-
template <typename Value, typename Enable = std::enable_if_t<has_payload and sizeof(Value)>>
1516+
template <typename Value, typename Enable = cuda::std::enable_if_t<has_payload and sizeof(Value)>>
15181517
[[nodiscard]] __device__ constexpr auto extract_payload(Value const& value) const noexcept
15191518
{
15201519
return thrust::raw_reference_cast(value).second;

include/cuco/detail/pair/helpers.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -83,7 +83,8 @@ using packed_t = typename packed<sizeof(Pair)>::type;
8383
template <typename Pair>
8484
__host__ __device__ constexpr bool is_packable()
8585
{
86-
return not std::is_void<packed_t<Pair>>::value and std::has_unique_object_representations_v<Pair>;
86+
return not cuda::std::is_void<packed_t<Pair>>::value and
87+
cuda::std::has_unique_object_representations_v<Pair>;
8788
}
8889

8990
/**

include/cuco/detail/pair/pair.inl

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
1616

1717
#pragma once
1818

19-
#include <type_traits>
20-
#include <utility>
19+
#include <cuda/std/type_traits>
20+
#include <cuda/std/utility>
2121

2222
namespace cuco {
2323

@@ -35,10 +35,11 @@ __host__ __device__ constexpr pair<First, Second>::pair(pair<F, S> const& p)
3535
}
3636

3737
template <typename F, typename S>
38-
__host__ __device__ constexpr pair<std::decay_t<F>, std::decay_t<S>> make_pair(F&& f,
39-
S&& s) noexcept
38+
__host__ __device__ constexpr pair<cuda::std::decay_t<F>, cuda::std::decay_t<S>> make_pair(
39+
F&& f, S&& s) noexcept
4040
{
41-
return pair<std::decay_t<F>, std::decay_t<S>>(std::forward<F>(f), std::forward<S>(s));
41+
return pair<cuda::std::decay_t<F>, cuda::std::decay_t<S>>(cuda::std::forward<F>(f),
42+
cuda::std::forward<S>(s));
4243
}
4344

4445
template <class T1, class T2, class U1, class U2>

0 commit comments

Comments
 (0)