NVIDIA
diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/include/cuco/detail/open_addressing/functors.cuh b/include/cuco/detail/open_addressing/functors.cuh
Lines changed: 2 additions & 2 deletions
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
Lines changed: 14 additions & 16 deletions
diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
Lines changed: 32 additions & 35 deletions
@@ -83,7 +83,7 @@ __host__ __device__ constexpr std::size_t alignment()
  * @return If the bits in the object representations of lhs and rhs are identical.
  */
 template <typename T>
-__host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)
+__host__ __device__ constexpr bool bitwise_compare(T lhs, T rhs)
 {
   static_assert(
     cuco::is_bitwise_comparable_v<T>,
 
@@ -73,7 +73,7 @@ struct slot_is_filled {
    * @param empty_sentinel Key sentinel indicating an empty slot
    * @param erased_sentinel Key sentinel indicating an erased slot
    */
-  explicit constexpr slot_is_filled(T const& empty_sentinel, T const& erased_sentinel) noexcept
+  explicit constexpr slot_is_filled(T empty_sentinel, T erased_sentinel) noexcept
     : empty_sentinel_{empty_sentinel}, erased_sentinel_{erased_sentinel}
   {
   }
@@ -88,7 +88,7 @@ struct slot_is_filled {
    * @return `true` if slot is filled
    */
   template <typename S>
-  __device__ constexpr bool operator()(S const& slot) const noexcept
+  __device__ constexpr bool operator()(S slot) const noexcept
   {
     auto const key = [&]() {
       if constexpr (HasPayload) {
 
@@ -78,8 +78,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-        *(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
       if constexpr (CGSize == 1) {
         if (ref.insert(insert_element)) { thread_num_successes++; };
       } else {
@@ -138,8 +137,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-        *(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
       if constexpr (CGSize == 1) {
         ref.insert(insert_element);
       } else {
@@ -175,7 +173,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const erase_element{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.erase(erase_element);
     } else {
@@ -216,7 +214,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.for_each(key, callback_op);
     } else {
@@ -280,7 +278,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -294,7 +292,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(block);
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
         auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
         if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
       }
@@ -396,8 +394,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found                                                    = ref.find(key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
+        auto const found                                                   = ref.find(key);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -411,8 +409,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(block);
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found                                                    = ref.find(tile, key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
+        auto const found                                                   = ref.find(tile, key);
 
         if (tile.thread_rank() == 0) {
           *(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -486,7 +484,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
           *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(insert_element);
         /*
@@ -506,7 +504,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
           *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
         if (tile.thread_rank() == 0) {
@@ -557,7 +555,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+    typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
     if constexpr (CGSize == 1) {
       if constexpr (IsOuter) {
         thread_count += max(ref.count(key), outer_min_count);
@@ -617,7 +615,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first,
   size_type constexpr outer_min_count = 1;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+    typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
     if constexpr (CGSize == 1) {
       if constexpr (IsOuter) {
         *(output_begin + idx) = max(ref.count(key), size_type{outer_min_count});
 
@@ -372,7 +372,7 @@ class open_addressing_ref_impl {
    * @return True if the given element is successfully inserted
    */
   template <typename Value>
-  __device__ bool insert(Value const& value) noexcept
+  __device__ bool insert(Value value) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -428,7 +428,7 @@ class open_addressing_ref_impl {
    */
   template <bool SupportsErase, typename Value, typename ParentCG>
   __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-                         Value const& value) noexcept
+                         Value value) noexcept
   {
     auto const val = this->heterogeneous_value(value);
     auto const key = this->extract_key(val);
@@ -513,7 +513,7 @@ class open_addressing_ref_impl {
    * insertion is successful or not.
    */
   template <typename Value>
-  __device__ cuda::std::pair<iterator, bool> insert_and_find(Value const& value) noexcept
+  __device__ cuda::std::pair<iterator, bool> insert_and_find(Value value) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 #if __CUDA_ARCH__ < 700
@@ -589,7 +589,7 @@ class open_addressing_ref_impl {
    */
   template <typename Value, typename ParentCG>
   __device__ cuda::std::pair<iterator, bool> insert_and_find(
-    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value) noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value value) noexcept
   {
 #if __CUDA_ARCH__ < 700
     // Spinning to ensure that the write to the value part took place requires
@@ -680,12 +680,12 @@ class open_addressing_ref_impl {
    *
    * @tparam ProbeKey Input type which is convertible to 'key_type'
    *
-   * @param value The element to erase
+   * @param key The element to erase
    *
    * @return True if the given element is successfully erased
    */
   template <typename ProbeKey>
-  __device__ bool erase(ProbeKey const& key) noexcept
+  __device__ bool erase(ProbeKey key) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -726,13 +726,13 @@ class open_addressing_ref_impl {
    * @tparam ParentCG Type of parent Cooperative Group
    *
    * @param group The Cooperative Group used to perform group erase
-   * @param value The element to erase
+   * @param key The element to erase
    *
    * @return True if the given element is successfully erased
    */
   template <typename ProbeKey, typename ParentCG>
   __device__ bool erase(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-                        ProbeKey const& key) noexcept
+                        ProbeKey key) noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -790,7 +790,7 @@ class open_addressing_ref_impl {
    * @return A boolean indicating whether the probe key is present
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ bool contains(ProbeKey key) const noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
     auto probing_iter =
@@ -830,8 +830,7 @@ class open_addressing_ref_impl {
    */
   template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-    ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -871,7 +870,7 @@ class open_addressing_ref_impl {
    * @return An iterator to the position at which the equivalent key is stored
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ iterator find(ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ iterator find(ProbeKey key) const noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
     auto probing_iter =
@@ -915,8 +914,7 @@ class open_addressing_ref_impl {
    */
   template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ iterator
-  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-       ProbeKey const& key) const noexcept
+  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -964,7 +962,7 @@ class open_addressing_ref_impl {
    * @return Number of occurrences found by the current thread
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ size_type count(ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ size_type count(ProbeKey key) const noexcept
   {
     if constexpr (not allows_duplicates) {
       return static_cast<size_type>(this->contains(key));
@@ -1013,8 +1011,7 @@ class open_addressing_ref_impl {
    */
   template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ size_type
-  count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-        ProbeKey const& key) const noexcept
+  count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -1369,7 +1366,7 @@ class open_addressing_ref_impl {
    * @param callback_op Function to apply to every matched slot
    */
   template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
+  __device__ void for_each(ProbeKey key, CallbackOp&& callback_op) const noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
     auto probing_iter =
@@ -1420,7 +1417,7 @@ class open_addressing_ref_impl {
    */
   template <class ProbeKey, class CallbackOp, typename ParentCG>
   __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-                           ProbeKey const& key,
+                           ProbeKey key,
                            CallbackOp&& callback_op) const noexcept
   {
     auto probing_iter =
@@ -1485,7 +1482,7 @@ class open_addressing_ref_impl {
    */
   template <class ProbeKey, class CallbackOp, class SyncOp, typename ParentCG>
   __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
-                           ProbeKey const& key,
+                           ProbeKey key,
                            CallbackOp&& callback_op,
                            SyncOp&& sync_op) const noexcept
   {
@@ -1545,7 +1542,7 @@ class open_addressing_ref_impl {
    * @return The key
    */
   template <typename Value>
-  [[nodiscard]] __host__ __device__ constexpr auto extract_key(Value const& value) const noexcept
+  [[nodiscard]] __host__ __device__ constexpr auto extract_key(Value value) const noexcept
   {
     if constexpr (has_payload) {
       return thrust::raw_reference_cast(value).first;
@@ -1566,8 +1563,7 @@ class open_addressing_ref_impl {
    * @return The payload
    */
   template <typename Value, typename Enable = cuda::std::enable_if_t<has_payload and sizeof(Value)>>
-  [[nodiscard]] __host__ __device__ constexpr auto extract_payload(
-    Value const& value) const noexcept
+  [[nodiscard]] __host__ __device__ constexpr auto extract_payload(Value value) const noexcept
   {
     return thrust::raw_reference_cast(value).second;
   }
@@ -1582,7 +1578,7 @@ class open_addressing_ref_impl {
    * @return The converted object
    */
   template <typename T>
-  [[nodiscard]] __device__ constexpr value_type native_value(T const& value) const noexcept
+  [[nodiscard]] __device__ constexpr value_type native_value(T value) const noexcept
   {
     if constexpr (has_payload) {
       return {static_cast<key_type>(this->extract_key(value)), this->extract_payload(value)};
@@ -1602,7 +1598,7 @@ class open_addressing_ref_impl {
    * @return The converted object
    */
   template <typename T>
-  [[nodiscard]] __device__ constexpr auto heterogeneous_value(T const& value) const noexcept
+  [[nodiscard]] __device__ constexpr auto heterogeneous_value(T value) const noexcept
   {
     if constexpr (has_payload and not cuda::std::is_same_v<T, value_type>) {
       using mapped_type = decltype(this->empty_value_sentinel());
@@ -1624,7 +1620,7 @@ class open_addressing_ref_impl {
    *
    * @return The sentinel value used to represent an erased slot
    */
-  [[nodiscard]] __device__ constexpr value_type const erased_slot_sentinel() const noexcept
+  [[nodiscard]] __device__ constexpr value_type erased_slot_sentinel() const noexcept
   {
     if constexpr (has_payload) {
       return cuco::pair{this->erased_key_sentinel(), this->empty_value_sentinel()};
@@ -1685,8 +1681,8 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   [[nodiscard]] __device__ constexpr insert_result back_to_back_cas(value_type* address,
-                                                                    value_type const& expected,
-                                                                    Value const& desired) noexcept
+                                                                    value_type expected,
+                                                                    Value desired) noexcept
   {
     using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
 
@@ -1736,8 +1732,9 @@ class open_addressing_ref_impl {
    * @return Result of this operation, i.e., success/continue/duplicate
    */
   template <typename Value>
-  [[nodiscard]] __device__ constexpr insert_result cas_dependent_write(
-    value_type* address, value_type const& expected, Value const& desired) noexcept
+  [[nodiscard]] __device__ constexpr insert_result cas_dependent_write(value_type* address,
+                                                                       value_type expected,
+                                                                       Value desired) noexcept
   {
     using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
 
@@ -1778,8 +1775,8 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   [[nodiscard]] __device__ insert_result attempt_insert(value_type* address,
-                                                        value_type const& expected,
-                                                        Value const& desired) noexcept
+                                                        value_type expected,
+                                                        Value desired) noexcept
   {
     if constexpr (sizeof(value_type) <= 8) {
       return packed_cas(address, expected, desired);
@@ -1811,8 +1808,8 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   [[nodiscard]] __device__ insert_result attempt_insert_stable(value_type* address,
-                                                               value_type const& expected,
-                                                               Value const& desired) noexcept
+                                                               value_type expected,
+                                                               Value desired) noexcept
   {
     if constexpr (sizeof(value_type) <= 8) {
       return packed_cas(address, expected, desired);
@@ -1833,7 +1830,7 @@ class open_addressing_ref_impl {
    * @param sentinel The slot sentinel value
    */
   template <typename T>
-  __device__ void wait_for_payload(T& slot, T const& sentinel) const noexcept
+  __device__ void wait_for_payload(T& slot, T sentinel) const noexcept
   {
     auto ref = cuda::atomic_ref<T, Scope>{slot};
     T current;
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ __host__ __device__ constexpr std::size_t alignment()`
`83`	`83`	`* @return If the bits in the object representations of lhs and rhs are identical.`
`84`	`84`	`*/`
`85`	`85`	`template <typename T>`
`86`		`-__host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)`
	`86`	`+__host__ __device__ constexpr bool bitwise_compare(T lhs, T rhs)`
`87`	`87`	`{`
`88`	`88`	`static_assert(`
`89`	`89`	`cuco::is_bitwise_comparable_v<T>,`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ struct slot_is_filled {`
`73`	`73`	`* @param empty_sentinel Key sentinel indicating an empty slot`
`74`	`74`	`* @param erased_sentinel Key sentinel indicating an erased slot`
`75`	`75`	`*/`
`76`		`- explicit constexpr slot_is_filled(T const& empty_sentinel, T const& erased_sentinel) noexcept`
	`76`	`+ explicit constexpr slot_is_filled(T empty_sentinel, T erased_sentinel) noexcept`
`77`	`77`	`: empty_sentinel_{empty_sentinel}, erased_sentinel_{erased_sentinel}`
`78`	`78`	`{`
`79`	`79`	`}`
`@@ -88,7 +88,7 @@ struct slot_is_filled {`
`88`	`88`	``* @return `true` if slot is filled``
`89`	`89`	`*/`
`90`	`90`	`template <typename S>`
`91`		`- __device__ constexpr bool operator()(S const& slot) const noexcept`
	`91`	`+ __device__ constexpr bool operator()(S slot) const noexcept`
`92`	`92`	`{`
`93`	`93`	`auto const key = [&]() {`
`94`	`94`	`if constexpr (HasPayload) {`