@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,11 +20,11 @@
 #include <cub/block/block_reduce.cuh>
 #include <cuda/atomic>
 #include <cuda/functional>
+#include <cuda/std/iterator>
+#include <cuda/std/type_traits>
 
 #include <cooperative_groups.h>
 
-#include <iterator>
-
 namespace cuco::detail::open_addressing_ns {
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
@@ -77,7 +77,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        *(first + idx)};
       if constexpr (CGSize == 1) {
         if (ref.insert(insert_element)) { thread_num_successes++; };
       } else {
@@ -135,7 +136,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        *(first + idx)};
       if constexpr (CGSize == 1) {
         ref.insert(insert_element);
       } else {
@@ -170,7 +172,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
   auto idx = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.erase(erase_element);
     } else {
@@ -210,7 +212,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
   auto idx = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.for_each(key, callback_op);
     } else {
@@ -273,7 +275,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -287,7 +289,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
         auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
         if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
       }
@@ -367,7 +369,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   using output_type = typename find_buffer<Ref>::type;
   __shared__ output_type output_buffer[BlockSize / CGSize];
 
-  auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
+  auto constexpr has_payload =
+    not cuda::std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
 
   auto const sentinel = [&]() {
     if constexpr (has_payload) {
@@ -388,8 +391,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found = ref.find(key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        auto const found = ref.find(key);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -403,8 +406,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found = ref.find(tile, key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        auto const found = ref.find(tile, key);
 
         if (tile.thread_rank() == 0) {
           *(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -461,7 +464,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
 
   using output_type = typename find_buffer<Ref>::type;
 
-  auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
+  auto constexpr has_payload =
+    not cuda::std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
 
   auto output = cuda::proclaim_return_type<output_type>([&] __device__(auto found) {
     if constexpr (has_payload) {
@@ -477,7 +481,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+          *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(insert_element);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
@@ -496,7 +501,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
       if (idx < n) {
-        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+          *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
         if (tile.thread_rank() == 0) {
           *(found_begin + idx) = output(iter);
@@ -546,7 +552,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
   auto idx = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+    typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
     if constexpr (CGSize == 1) {
       if constexpr (IsOuter) {
         thread_count += max(ref.count(key), outer_min_count);
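
For context on the pattern this diff applies throughout the file: the host-only traits std::iterator_traits and std::is_same_v are replaced by their libcu++ counterparts from <cuda/std/iterator> and <cuda/std/type_traits>, which are usable from both host and device code. Below is a minimal, self-contained sketch of that usage; the kernel and every name in it are illustrative assumptions, not part of this commit or of the cuco API.

#include <cuda/std/iterator>
#include <cuda/std/type_traits>

#include <cstddef>

// Illustrative kernel: query an iterator's value_type on the device via
// cuda::std::iterator_traits instead of the host-only std::iterator_traits,
// mirroring the dereference pattern used by the kernels in this file.
template <typename InputIt, typename OutputIt>
__global__ void copy_n_kernel(InputIt first, std::size_t n, OutputIt output)
{
  using value_type = typename cuda::std::iterator_traits<InputIt>::value_type;
  static_assert(cuda::std::is_same_v<value_type,
                                     typename cuda::std::iterator_traits<OutputIt>::value_type>,
                "input and output iterators must have the same value_type");

  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    value_type const& element = *(first + idx);
    *(output + idx) = element;
  }
}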