NVIDIA
diff --git a/ci/build.sh b/ci/build.sh
Lines changed: 5 additions & 5 deletions
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Lines changed: 4 additions & 3 deletions
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
Lines changed: 6 additions & 4 deletions
diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
Lines changed: 70 additions & 46 deletions
diff --git a/include/cuco/detail/open_addressing/functors.cuh b/include/cuco/detail/open_addressing/functors.cuh
Lines changed: 4 additions & 2 deletions
@@ -131,16 +131,16 @@ while [ "${#args[@]}" -ne 0 ]; do
     esac
 done
 
+if [ $VERBOSE ]; then
+    set -x
+fi
+
 # Convert to full paths:
 HOST_COMPILER=$(which ${HOST_COMPILER})
 CUDA_COMPILER=$(which ${CUDA_COMPILER})
 # Make CUDA arch list compatible with cmake
 CUDA_ARCHS=$(echo "$CUDA_ARCHS" | tr ' ,' ';;')
 
-if [ $VERBOSE ]; then
-    set -x
-fi
-
 # Begin processing unsets after option parsing
 set -u
 
@@ -217,4 +217,4 @@ if command -v sccache >/dev/null; then
     source "./sccache_stats.sh" end
 else
     echo "sccache stats: N/A"
-fi
+fi
@@ -183,7 +183,8 @@ class bloom_filter_impl {
     // If single thread is optimal, use scalar add
     if constexpr (worker_num_threads == 1) {
       for (auto i = rank; i < num_keys; i += num_threads) {
-        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + i)};
+        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+          *(first + i)};
         this->add(insert_element);
       }
     } else if constexpr (num_threads == worker_num_threads) {  // given CG is optimal CG
@@ -193,7 +194,7 @@ class bloom_filter_impl {
       auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
       for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
         if (i + rank < num_keys) {
-          typename std::iterator_traits<InputIt>::value_type const& insert_element{
+          typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
             *(first + i + rank)};
           hash_value  = policy_.hash(insert_element);
           block_index = policy_.block_index(hash_value, num_blocks_);
@@ -214,7 +215,7 @@ class bloom_filter_impl {
 
       for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
         if (i + rank < num_keys) {
-          typename std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
+          typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
           hash_value  = policy_.hash(key);
           block_index = policy_.block_index(hash_value, num_blocks_);
         }
 
@@ -17,10 +17,11 @@
 
 #include <cuco/detail/utility/cuda.cuh>
 
+#include <cuda/std/iterator>
+
 #include <cooperative_groups.h>
 
 #include <cstdint>
-#include <iterator>
 
 namespace cuco::detail::bloom_filter_ns {
 
@@ -66,7 +67,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        *(first + idx)};
       ref.add(tile, insert_element);
     }
     idx += loop_stride;
@@ -96,14 +98,14 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
 
   if constexpr (CGSize == 1) {
     while (idx < n) {
-      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+      typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
       *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false;
       idx += loop_stride;
     }
   } else {
     auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
     while (idx < n) {
-      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+      typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
       auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
       if (tile.thread_rank() == 0) { *(out + idx) = found; }
       idx += loop_stride;
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,16 +28,70 @@
 namespace cuco::detail {
 
 /**
- * @brief The 32bit integer finalizer hash function of `MurmurHash3`.
+ * @brief The 32-bit integer finalizer function of `MurmurHash3`.
+ *
+ * This function implements the final mixing step of the `MurmurHash3` algorithm for 32-bit values.
+ * It is designed to improve the avalanche behavior of the hash, ensuring that changes in input bits
+ * have a more uniform effect on all output bits.
  *
  * @note Compilation fails (via `static_assert`) unless `Key` is exactly 4 bytes in size
  *
- * @tparam Key The type of the values to hash
+ * @tparam Key The type of the value to finalize
+ *
+ * @param key The input value to finalize
+ * @param seed Optional seed that is XORed into `key` before mixing (defaults to 0)
+ * @return The finalized 32-bit hash value
  */
 template <typename Key>
-struct MurmurHash3_fmix32 {
+__host__ __device__ constexpr std::uint32_t fmix32(Key key, std::uint32_t seed = 0) noexcept
+{
   static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size.");
 
+  std::uint32_t h = static_cast<std::uint32_t>(key) ^ seed;
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+  return h;
+}
+
+/**
+ * @brief The 64-bit integer finalizer function of `MurmurHash3`.
+ *
+ * This function implements the final mixing step of the `MurmurHash3` algorithm for 64-bit values.
+ * It is designed to improve the avalanche behavior of the hash, ensuring that changes in input bits
+ * have a more uniform effect on all output bits.
+ *
+ * @note Compilation fails (via `static_assert`) unless `Key` is exactly 8 bytes in size
+ *
+ * @tparam Key The type of the value to finalize
+ *
+ * @param key The input value to finalize
+ * @param seed Optional seed that is XORed into `key` before mixing (defaults to 0)
+ * @return The finalized 64-bit hash value
+ */
+template <typename Key>
+__host__ __device__ constexpr std::uint64_t fmix64(Key key, std::uint64_t seed = 0) noexcept
+{
+  static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size.");
+
+  std::uint64_t h = static_cast<std::uint64_t>(key) ^ seed;
+  h ^= h >> 33;
+  h *= 0xff51afd7ed558ccdULL;
+  h ^= h >> 33;
+  h *= 0xc4ceb9fe1a85ec53ULL;
+  h ^= h >> 33;
+  return h;
+}
+
+/**
+ * @brief The 32bit integer finalizer hash function of `MurmurHash3`.
+ *
+ * @tparam Key The type of the values to hash
+ */
+template <typename Key>
+struct MurmurHash3_fmix32 {
   using argument_type = Key;            ///< The type of the values taken as argument
   using result_type   = std::uint32_t;  ///< The type of the hash values produced
 
@@ -56,13 +110,7 @@ struct MurmurHash3_fmix32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    std::uint32_t h = static_cast<std::uint32_t>(key) ^ seed_;
-    h ^= h >> 16;
-    h *= 0x85ebca6b;
-    h ^= h >> 13;
-    h *= 0xc2b2ae35;
-    h ^= h >> 16;
-    return h;
+    return fmix32(key, seed_);
   }
 
  private:
@@ -72,14 +120,10 @@ struct MurmurHash3_fmix32 {
 /**
  * @brief The 64bit integer finalizer hash function of `MurmurHash3`.
  *
- * @throw Key type must be 8 bytes in size
- *
  * @tparam Key The type of the values to hash
  */
 template <typename Key>
 struct MurmurHash3_fmix64 {
-  static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size.");
-
   using argument_type = Key;            ///< The type of the values taken as argument
   using result_type   = std::uint64_t;  ///< The type of the hash values produced
 
@@ -98,13 +142,7 @@ struct MurmurHash3_fmix64 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    std::uint64_t h = static_cast<std::uint64_t>(key) ^ seed_;
-    h ^= h >> 33;
-    h *= 0xff51afd7ed558ccd;
-    h ^= h >> 33;
-    h *= 0xc4ceb9fe1a85ec53;
-    h ^= h >> 33;
-    return h;
+    return fmix64(key, seed_);
   }
 
  private:
@@ -136,7 +174,7 @@ struct MurmurHash3_32 {
    *
    * @param seed A custom number to randomize the resulting hash value
    */
-  __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {}
+  __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : seed_{seed} {}
 
   /**
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
@@ -199,7 +237,7 @@ struct MurmurHash3_32 {
     //----------
     // finalization
     h1 ^= size;
-    h1 = fmix32_(h1);
+    h1 = fmix32(h1);
     return h1;
   }
 
@@ -224,12 +262,6 @@ struct MurmurHash3_32 {
   }
 
  private:
-  constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept
-  {
-    return (x << r) | (x >> (32 - r));
-  }
-
-  MurmurHash3_fmix32<std::uint32_t> fmix32_;
   std::uint32_t seed_;
 };
 
@@ -258,10 +290,7 @@ struct MurmurHash3_x64_128 {
    *
    * @param seed A custom number to randomize the resulting hash value
    */
-  __host__ __device__ constexpr MurmurHash3_x64_128(std::uint64_t seed = 0)
-    : fmix64_{0}, seed_{seed}
-  {
-  }
+  __host__ __device__ constexpr MurmurHash3_x64_128(std::uint64_t seed = 0) : seed_{seed} {}
 
   /**
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
@@ -362,8 +391,8 @@ struct MurmurHash3_x64_128 {
     h1 += h2;
     h2 += h1;
 
-    h1 = fmix64_(h1);
-    h2 = fmix64_(h2);
+    h1 = fmix64(h1);
+    h2 = fmix64(h2);
 
     h1 += h2;
     h2 += h1;
@@ -392,7 +421,6 @@ struct MurmurHash3_x64_128 {
   }
 
  private:
-  MurmurHash3_fmix64<std::uint64_t> fmix64_;
   std::uint64_t seed_;
 };
 
@@ -421,10 +449,7 @@ struct MurmurHash3_x86_128 {
    *
    * @param seed A custom number to randomize the resulting hash value
    */
-  __host__ __device__ constexpr MurmurHash3_x86_128(std::uint32_t seed = 0)
-    : fmix32_{0}, seed_{seed}
-  {
-  }
+  __host__ __device__ constexpr MurmurHash3_x86_128(std::uint32_t seed = 0) : seed_{seed} {}
 
   /**
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
@@ -573,10 +598,10 @@ struct MurmurHash3_x86_128 {
     h3 += h1;
     h4 += h1;
 
-    h1 = fmix32_(h1);
-    h2 = fmix32_(h2);
-    h3 = fmix32_(h3);
-    h4 = fmix32_(h4);
+    h1 = fmix32(h1);
+    h2 = fmix32(h2);
+    h3 = fmix32(h3);
+    h4 = fmix32(h4);
 
     h1 += h2;
     h1 += h3;
@@ -609,7 +634,6 @@ struct MurmurHash3_x86_128 {
   }
 
  private:
-  MurmurHash3_fmix32<std::uint32_t> fmix32_;
   std::uint32_t seed_;
 };
 
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/pair/traits.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuco::detail::open_addressing_ns {
 
 /**
@@ -49,7 +51,7 @@ struct get_slot {
     auto const intra_idx  = idx % StorageRef::bucket_size;
     if constexpr (HasPayload) {
       auto const [first, second] = storage_[bucket_idx][intra_idx];
-      return thrust::make_tuple(first, second);
+      return thrust::tuple{first, second};
     } else {
       return storage_[bucket_idx][intra_idx];
     }