@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,11 +20,11 @@
 #include <cub/block/block_reduce.cuh>
 #include <cuda/atomic>
 #include <cuda/functional>
+#include <cuda/std/iterator>
+#include <cuda/std/type_traits>
 
 #include <cooperative_groups.h>
 
-#include <iterator>
-
 namespace cuco::detail::open_addressing_ns {
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
@@ -77,7 +77,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        *(first + idx)};
       if constexpr (CGSize == 1) {
         if (ref.insert(insert_element)) { thread_num_successes++; };
       } else {
@@ -135,7 +136,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        *(first + idx)};
       if constexpr (CGSize == 1) {
         ref.insert(insert_element);
       } else {
@@ -170,7 +172,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
   auto idx = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.erase(erase_element);
     } else {
@@ -210,7 +212,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
   auto idx = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.for_each(key, callback_op);
     } else {
@@ -273,7 +275,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -287,7 +289,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
         auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
         if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
       }
@@ -367,7 +369,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   using output_type = typename find_buffer<Ref>::type;
   __shared__ output_type output_buffer[BlockSize / CGSize];
 
-  auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
+  auto constexpr has_payload =
+    not cuda::std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
 
   auto const sentinel = [&]() {
     if constexpr (has_payload) {
@@ -388,8 +391,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found = ref.find(key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        auto const found = ref.find(key);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -403,8 +406,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found = ref.find(tile, key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        auto const found = ref.find(tile, key);
 
         if (tile.thread_rank() == 0) {
           *(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -461,7 +464,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
 
   using output_type = typename find_buffer<Ref>::type;
 
-  auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
+  auto constexpr has_payload =
+    not cuda::std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
 
   auto output = cuda::proclaim_return_type<output_type>([&] __device__(auto found) {
     if constexpr (has_payload) {
@@ -477,7 +481,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+          *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(insert_element);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
@@ -496,7 +501,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+          *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
         if (tile.thread_rank() == 0) {
           *(found_begin + idx) = output(iter);
@@ -546,7 +552,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
   auto idx = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+    typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
     if constexpr (CGSize == 1) {
       if constexpr (IsOuter) {
         thread_count += max(ref.count(key), outer_min_count);
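
For context on the pattern this diff applies throughout the file: the host-only traits std::iterator_traits and std::is_same_v are replaced by their libcu++ counterparts from <cuda/std/iterator> and <cuda/std/type_traits>, which are usable from both host and device code. Below is a minimal, self-contained sketch of that usage; the kernel and every name in it are illustrative assumptions, not part of this commit or of the cuco API.

#include <cuda/std/iterator>
#include <cuda/std/type_traits>

#include <cstddef>

// Illustrative kernel: query an iterator's value_type on the device via
// cuda::std::iterator_traits instead of the host-only std::iterator_traits,
// mirroring the dereference pattern used by the kernels in this file.
template <typename InputIt, typename OutputIt>
__global__ void copy_n_kernel(InputIt first, std::size_t n, OutputIt output)
{
  using value_type = typename cuda::std::iterator_traits<InputIt>::value_type;
  static_assert(cuda::std::is_same_v<value_type,
                                     typename cuda::std::iterator_traits<OutputIt>::value_type>,
                "input and output iterators must have the same value_type");

  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    value_type const& element = *(first + idx);
    *(output + idx) = element;
  }
}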