
Commit d11d9a1

Add HLL add_if_async APIs
1 parent 22fb1db

File tree: 9 files changed (+357, -96 lines)


include/cuco/detail/hyperloglog/hyperloglog.inl

Lines changed: 9 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -79,6 +79,14 @@ constexpr void hyperloglog<T, Scope, Hash, Allocator>::add(InputIt first,
   ref_.add(first, last, stream);
 }
 
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class InputIt, class StencilIt, class Predicate>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::add_if_async(
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
+{
+  ref_.add_if_async(first, last, stencil, pred, stream);
+}
+
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
 constexpr void hyperloglog<T, Scope, Hash, Allocator>::merge_async(

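To make the new surface concrete, here is a minimal host-side usage sketch. It assumes an estimator object exposing the `add_if_async` overload added above; the container setup, the `count_flagged` helper, and the `is_positive` functor are illustrative, not part of this commit.

#include <cuda_runtime.h>
#include <thrust/device_vector.h>

#include <cuda/stream_ref>

// Illustrative device-side predicate; any unary callable whose argument type
// is convertible from the stencil's value_type works.
struct is_positive {
  __device__ bool operator()(int flag) const { return flag > 0; }
};

// `Estimator` stands for any type exposing the add_if_async signature above,
// e.g. the hyperloglog container this commit extends.
template <class Estimator>
void count_flagged(Estimator& estimator,
                   thrust::device_vector<int> const& items,
                   thrust::device_vector<int> const& flags,
                   cuda::stream_ref stream)
{
  // items[i] contributes to the estimate only if is_positive(flags[i]).
  estimator.add_if_async(items.begin(), items.end(), flags.begin(), is_positive{}, stream);
  // add_if_async does not synchronize; wait before querying the estimate.
  cudaStreamSynchronize(stream.get());
}
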
include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 78 additions & 43 deletions
@@ -26,12 +26,14 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/atomic>
+#include <cuda/functional>
 #include <cuda/std/__algorithm/max.h>  // TODO #include <cuda/std/algorithm> once available
 #include <cuda/std/bit>
 #include <cuda/std/cstddef>
 #include <cuda/std/span>
 #include <cuda/std/utility>
 #include <cuda/stream_ref>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cooperative_groups.h>
@@ -172,6 +174,60 @@ class hyperloglog_impl {
    */
   template <class InputIt>
   __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream)
+  {
+    auto const always_true = thrust::constant_iterator<bool>(true);
+    this->add_if_async(first, last, always_true, cuda::std::identity{}, stream);
+  }
+
+  /**
+   * @brief Adds to-be-counted items to the estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `add_async`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <class InputIt>
+  __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream)
+  {
+    this->add_async(first, last, stream);
+#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
+    stream.sync();
+#else
+    stream.wait();
+#endif
+  }
+
+  /**
+   * @brief Asynchronously adds items in the range `[first, last)` if `pred` of the corresponding
+   * stencil returns true.
+   *
+   * @note The item `*(first + i)` is added if `pred( *(stencil + i) )` returns true.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   * @tparam StencilIt Device accessible random access iterator whose value_type is
+   * convertible to Predicate's argument type
+   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
+   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stencil Beginning of the stencil sequence
+   * @param pred Predicate to test on every element in the range `[stencil, stencil +
+   * std::distance(first, last))`
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <class InputIt, class StencilIt, class Predicate>
+  __host__ constexpr void add_if_async(
+    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
   {
     auto const num_items = cuco::detail::distance(first, last);
     if (num_items == 0) { return; }
@@ -181,8 +237,6 @@ class hyperloglog_impl {
     int const shmem_bytes = sketch_bytes();
     void const* kernel    = nullptr;
 
-    // In case the input iterator represents a contiguous memory segment we can employ efficient
-    // vectorized loads
     if constexpr (thrust::is_contiguous_iterator_v<InputIt>) {
       auto const ptr = thrust::raw_pointer_cast(&first[0]);
       auto constexpr max_vector_bytes = 32;
@@ -193,54 +247,60 @@ class hyperloglog_impl {
       switch (vector_size) {
         case 2:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_impl>);
+            cuco::hyperloglog_ns::detail::
+              add_if_shmem_vectorized<2, StencilIt, Predicate, hyperloglog_impl>);
           break;
         case 4:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_impl>);
+            cuco::hyperloglog_ns::detail::
+              add_if_shmem_vectorized<4, StencilIt, Predicate, hyperloglog_impl>);
           break;
         case 8:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_impl>);
+            cuco::hyperloglog_ns::detail::
+              add_if_shmem_vectorized<8, StencilIt, Predicate, hyperloglog_impl>);
           break;
         case 16:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_impl>);
+            cuco::hyperloglog_ns::detail::
+              add_if_shmem_vectorized<16, StencilIt, Predicate, hyperloglog_impl>);
           break;
       };
     }
 
     if (kernel != nullptr and this->try_reserve_shmem(kernel, shmem_bytes)) {
       if constexpr (thrust::is_contiguous_iterator_v<InputIt>) {
-        // We make use of the occupancy calculator to get the minimum number of blocks which still
-        // saturates the GPU. This reduces the shmem initialization overhead and atomic contention
-        // on the final register array during the merge phase.
         CUCO_CUDA_TRY(
           cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
 
         auto const ptr = thrust::raw_pointer_cast(&first[0]);
-        void* kernel_args[] = {
-          (void*)(&ptr),  // TODO can't use reinterpret_cast since it can't cast away const
-          (void*)(&num_items),
-          reinterpret_cast<void*>(this)};
+        void* kernel_args[] = {(void*)(&ptr),
+                               (void*)(&num_items),
+                               (void*)(&stencil),
+                               (void*)(&pred),
+                               reinterpret_cast<void*>(this)};
         CUCO_CUDA_TRY(
           cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream.get()));
       }
     } else {
       kernel = reinterpret_cast<void const*>(
-        cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_impl>);
-      void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast<void*>(this)};
+        cuco::hyperloglog_ns::detail::
+          add_if_shmem<InputIt, StencilIt, Predicate, hyperloglog_impl>);
+      void* kernel_args[] = {(void*)(&first),
+                             (void*)(&num_items),
+                             (void*)(&stencil),
+                             (void*)(&pred),
+                             reinterpret_cast<void*>(this)};
       if (this->try_reserve_shmem(kernel, shmem_bytes)) {
         CUCO_CUDA_TRY(
          cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
 
        CUCO_CUDA_TRY(
          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream.get()));
      } else {
-        // Computes sketch directly in global memory. (Fallback path in case there is not enough
-        // shared memory available)
        kernel = reinterpret_cast<void const*>(
-          cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_impl>);
+          cuco::hyperloglog_ns::detail::
+            add_if_gmem<InputIt, StencilIt, Predicate, hyperloglog_impl>);
 
        CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0));
 
@@ -250,31 +310,6 @@ class hyperloglog_impl {
     }
   }
 
-  /**
-   * @brief Adds to-be-counted items to the estimator.
-   *
-   * @note This function synchronizes the given stream. For asynchronous execution use
-   * `add_async`.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * T></tt> is `true`
-   *
-   * @param first Beginning of the sequence of items
-   * @param last End of the sequence of items
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <class InputIt>
-  __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream)
-  {
-    this->add_async(first, last, stream);
-#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
-    stream.sync();
-#else
-    stream.wait();
-#endif
-  }
-
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
    *

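The refactor above collapses the unconditional path into the conditional one: `add_async` now forwards to `add_if_async` with a stencil that always reads `true` and the identity predicate. A minimal host-side sketch of that reduction in isolation; `for_each_if` and `for_each` are stand-in helpers with a plain loop for clarity, not cuco APIs (cuco dispatches to the CUDA kernels shown below instead):

#include <cstddef>

#include <thrust/iterator/constant_iterator.h>

#include <cuda/std/functional>

// Generic conditional bulk operation, mirroring the add_if_async contract:
// f(*(first + i)) runs only when pred(*(stencil + i)) is true.
template <class InputIt, class StencilIt, class Predicate, class F>
void for_each_if(InputIt first, InputIt last, StencilIt stencil, Predicate pred, F f)
{
  for (std::ptrdiff_t i = 0; first + i != last; ++i) {
    if (pred(*(stencil + i))) { f(*(first + i)); }
  }
}

// The unconditional variant costs nothing extra to express: a
// constant_iterator<bool>(true) stencil plus the identity predicate makes
// every element pass, exactly as add_async now does above.
template <class InputIt, class F>
void for_each(InputIt first, InputIt last, F f)
{
  for_each_if(first, last, thrust::constant_iterator<bool>(true), cuda::std::identity{}, f);
}
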
include/cuco/detail/hyperloglog/hyperloglog_ref.inl

Lines changed: 9 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,6 +67,14 @@ __host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(InputIt first,
   impl_.add(first, last, stream);
 }
 
+template <class T, cuda::thread_scope Scope, class Hash>
+template <class InputIt, class StencilIt, class Predicate>
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add_if_async(
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
+{
+  impl_.add_if_async(first, last, stencil, pred, stream);
+}
+
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
 __device__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(

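As with the owning container, the ref's `add` overload synchronizes the stream while `add_if_async` only enqueues work. A hedged call-site sketch of that difference; `Ref` stands for `hyperloglog_ref` or any type with these two members, and the helper name is illustrative:

#include <cuda_runtime.h>

#include <cuda/stream_ref>

template <class Ref, class InputIt, class StencilIt, class Predicate>
void add_two_ways(
  Ref& ref, InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
{
  // Synchronous: returns only after the stream has executed the update.
  ref.add(first, last, stream);

  // Asynchronous: enqueues the filtered update and returns immediately.
  ref.add_if_async(first, last, stencil, pred, stream);

  // The caller synchronizes before consuming the estimate.
  cudaStreamSynchronize(stream.get());
}
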
include/cuco/detail/hyperloglog/kernels.cuh

Lines changed: 52 additions & 50 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,17 +36,55 @@ CUCO_KERNEL void clear(RefType ref)
   if (block.group_index().x == 0) { ref.clear(block); }
 }
 
-template <int32_t VectorSize, class RefType>
-CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
-                                      cuco::detail::index_type n,
-                                      RefType ref)
+template <class InputIt, class StencilIt, class Predicate, class RefType>
+CUCO_KERNEL void add_if_gmem(
+  InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, RefType ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride();
+  auto idx               = cuco::detail::global_thread_id();
+
+  while (idx < n) {
+    if (pred(*(stencil + idx))) { ref.add(*(first + idx)); }
+    idx += loop_stride;
+  }
+}
+
+template <class InputIt, class StencilIt, class Predicate, class RefType>
+CUCO_KERNEL void add_if_shmem(
+  InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, RefType ref)
+{
+  using local_ref_type = typename RefType::template with_scope<cuda::thread_scope_block>;
+
+  extern __shared__ cuda::std::byte local_sketch[];
+
+  auto const loop_stride = cuco::detail::grid_stride();
+  auto idx               = cuco::detail::global_thread_id();
+  auto const block       = cooperative_groups::this_thread_block();
+
+  local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
+  local_ref.clear(block);
+  block.sync();
+
+  while (idx < n) {
+    if (pred(*(stencil + idx))) { local_ref.add(*(first + idx)); }
+    idx += loop_stride;
+  }
+  block.sync();
+
+  ref.merge(block, local_ref);
+}
+
+template <int32_t VectorSize, class StencilIt, class Predicate, class RefType>
+CUCO_KERNEL void add_if_shmem_vectorized(typename RefType::value_type const* first,
+                                         cuco::detail::index_type n,
+                                         StencilIt stencil,
+                                         Predicate pred,
+                                         RefType ref)
 {
   using value_type     = typename RefType::value_type;
   using vector_type    = cuda::std::array<value_type, VectorSize>;
   using local_ref_type = typename RefType::template with_scope<cuda::thread_scope_block>;
 
-  // Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which
-  // is sufficient for this purpose
   extern __shared__ cuda::std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();
@@ -58,29 +96,30 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
   local_ref.clear(block);
   block.sync();
 
-  // each thread processes VectorSize-many items per iteration
   vector_type vec;
   while (idx < n / VectorSize) {
     vec = *reinterpret_cast<vector_type*>(
       __builtin_assume_aligned(first + idx * VectorSize, sizeof(vector_type)));
-    for (auto const& i : vec) {
-      local_ref.add(i);
+    for (auto i = 0; i < VectorSize; ++i) {
+      if (pred(*(stencil + idx * VectorSize + i))) { local_ref.add(vec[i]); }
     }
     idx += loop_stride;
   }
-  // a single thread processes the remaining items
+
 #if defined(CUCO_HAS_CG_INVOKE_ONE)
   cooperative_groups::invoke_one(grid, [&]() {
     auto const remainder = n % VectorSize;
     cuda::static_for<VectorSize>([&] __device__(auto i) {
-      if (i() < remainder) { local_ref.add(*(first + n - i() - 1)); }
+      auto const item_idx = n - i() - 1;
+      if (i() < remainder && pred(*(stencil + item_idx))) { local_ref.add(*(first + item_idx)); }
     });
   });
 #else
   if (grid.thread_rank() == 0) {
     auto const remainder = n % VectorSize;
     cuda::static_for<VectorSize>([&] __device__(auto i) {
-      if (i() < remainder) { local_ref.add(*(first + n - i() - 1)); }
+      auto const item_idx = n - i() - 1;
+      if (i() < remainder && pred(*(stencil + item_idx))) { local_ref.add(*(first + item_idx)); }
     });
   }
 #endif
@@ -89,43 +128,6 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
   ref.merge(block, local_ref);
 }
 
-template <class InputIt, class RefType>
-CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
-{
-  using local_ref_type = typename RefType::template with_scope<cuda::thread_scope_block>;
-
-  // TODO assert alignment
-  extern __shared__ cuda::std::byte local_sketch[];
-
-  auto const loop_stride = cuco::detail::grid_stride();
-  auto idx               = cuco::detail::global_thread_id();
-  auto const block       = cooperative_groups::this_thread_block();
-
-  local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
-  local_ref.clear(block);
-  block.sync();
-
-  while (idx < n) {
-    local_ref.add(*(first + idx));
-    idx += loop_stride;
-  }
-  block.sync();
-
-  ref.merge(block, local_ref);
-}
-
-template <class InputIt, class RefType>
-CUCO_KERNEL void add_gmem(InputIt first, cuco::detail::index_type n, RefType ref)
-{
-  auto const loop_stride = cuco::detail::grid_stride();
-  auto idx               = cuco::detail::global_thread_id();
-
-  while (idx < n) {
-    ref.add(*(first + idx));
-    idx += loop_stride;
-  }
-}
-
 template <class OtherRefType, class RefType>
 CUCO_KERNEL void merge(OtherRefType other_ref, RefType ref)
 {

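The `add_if_*` kernels above share one structure: a block-private accumulator in dynamic shared memory, a grid-stride loop that adds only items whose stencil entry passes the predicate, and a single per-block merge into the global result, which keeps contended global atomics proportional to the number of blocks rather than the number of items. As a standalone illustration, here is a hedged CUDA sketch of that pattern applied to a toy bitmap instead of HLL registers; all names are illustrative, not cuco APIs:

#include <cstddef>

#include <cooperative_groups.h>

// Toy analogue of add_if_shmem: a block-private bitmap in dynamic shared
// memory stands in for the block-local HLL sketch.
template <class Predicate>
__global__ void mark_if(int const* items,
                        std::size_t n,
                        int const* stencil,
                        Predicate pred,
                        unsigned* global_bitmap,
                        int num_words)
{
  extern __shared__ unsigned local_bitmap[];  // like local_sketch[] above
  auto const block = cooperative_groups::this_thread_block();

  // Cooperatively clear the block-private state (cf. local_ref.clear(block)).
  for (int w = block.thread_rank(); w < num_words; w += block.size()) { local_bitmap[w] = 0u; }
  block.sync();

  // Grid-stride loop: only items whose stencil entry passes the predicate
  // contribute (cf. if (pred(*(stencil + idx))) { local_ref.add(...); }).
  auto const stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
  for (std::size_t idx = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < n;
       idx += stride) {
    if (pred(stencil[idx])) {
      auto const bit = static_cast<unsigned>(items[idx]) % (num_words * 32u);
      atomicOr(&local_bitmap[bit / 32u], 1u << (bit % 32u));
    }
  }
  block.sync();

  // One merge per block into the global state (cf. ref.merge(block, local_ref)).
  for (int w = block.thread_rank(); w < num_words; w += block.size()) {
    atomicOr(&global_bitmap[w], local_bitmap[w]);
  }
}

// Launch with num_words * sizeof(unsigned) bytes of dynamic shared memory,
// mirroring how the add_if_* kernels receive sketch_bytes() as the shmem size.
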