Matt711
diff --git a/‎conda/environments/all_cuda-118_arch-aarch64.yaml‎
Lines changed: 1 addition & 1 deletion b/‎conda/environments/all_cuda-118_arch-aarch64.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎conda/environments/all_cuda-118_arch-x86_64.yaml‎
Lines changed: 1 addition & 1 deletion b/‎conda/environments/all_cuda-118_arch-x86_64.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎conda/environments/all_cuda-128_arch-aarch64.yaml‎
Lines changed: 1 addition & 1 deletion b/‎conda/environments/all_cuda-128_arch-aarch64.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎conda/environments/all_cuda-128_arch-x86_64.yaml‎
Lines changed: 1 addition & 1 deletion b/‎conda/environments/all_cuda-128_arch-x86_64.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎conda/recipes/cudf/recipe.yaml‎
Lines changed: 1 addition & 1 deletion b/‎conda/recipes/cudf/recipe.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎cpp/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cpp/include/cudf/aggregation.hpp‎
Lines changed: 60 additions & 41 deletions b/‎cpp/include/cudf/aggregation.hpp‎
Lines changed: 60 additions & 41 deletions
diff --git a/‎cpp/include/cudf/detail/aggregation/aggregation.hpp‎
Lines changed: 48 additions & 0 deletions b/‎cpp/include/cudf/detail/aggregation/aggregation.hpp‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎cpp/include/cudf/detail/utilities/device_operators.cuh‎
Lines changed: 88 additions & 0 deletions b/‎cpp/include/cudf/detail/utilities/device_operators.cuh‎
Lines changed: 88 additions & 0 deletions
@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.9.0,!=0.10.0
+- numba-cuda>=0.10.1,<0.11.0a0
 - numba>=0.59.1,<0.62.0a0
 - numpy>=1.23,<3.0a0
 - numpydoc
 
@@ -56,7 +56,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.9.0,!=0.10.0
+- numba-cuda>=0.10.1,<0.11.0a0
 - numba>=0.59.1,<0.62.0a0
 - numpy>=1.23,<3.0a0
 - numpydoc
 
@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.9.0,!=0.10.0
+- numba-cuda>=0.10.1,<0.11.0a0
 - numba>=0.59.1,<0.62.0a0
 - numpy>=1.23,<3.0a0
 - numpydoc
 
@@ -55,7 +55,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.9.0,!=0.10.0
+- numba-cuda>=0.10.1,<0.11.0a0
 - numba>=0.59.1,<0.62.0a0
 - numpy>=1.23,<3.0a0
 - numpydoc
 
@@ -73,7 +73,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.4dev0
     - cupy >=12.0.0
-    - numba-cuda >=0.9.0,!=0.10.0
+    - numba-cuda >=0.10.1,<0.11.0a0
     - numba >=0.59.1,<0.62.0a0
     - numpy >=1.23,<3.0a0
     - pyarrow>=14.0.0,<20.0.0a0
 
@@ -452,6 +452,7 @@ add_library(
   src/groupby/sort/aggregate.cpp
   src/groupby/sort/group_argmax.cu
   src/groupby/sort/group_argmin.cu
+  src/groupby/sort/group_bitwise.cu
   src/groupby/sort/group_collect.cu
   src/groupby/sort/group_correlation.cu
   src/groupby/sort/group_count.cu
@@ -633,6 +634,7 @@ add_library(
   src/quantiles/quantiles.cu
   src/reductions/all.cu
   src/reductions/any.cu
+  src/reductions/bitwise.cu
   src/reductions/collect_ops.cu
   src/reductions/histogram.cu
   src/reductions/max.cu
 
@@ -70,6 +70,15 @@ enum class rank_percentage : int32_t {
   ONE_NORMALIZED    ///< (rank - 1) / (count - 1)
 };
 
+/**
+ * @brief Bitwise operations to use for BITWISE_AGG aggregations on numeric columns.
+ *
+ * Selects which binary bitwise operator is used to combine the input values. Note that the
+ * device operators implementing these operations report that they are only supported for
+ * integral types.
+ */
+enum class bitwise_op : int32_t {
+  AND,  ///< bitwise AND operation (`&`)
+  OR,   ///< bitwise OR operation (`|`)
+  XOR   ///< bitwise XOR operation (`^`)
+};
+
 /**
  * @brief Abstract base class for specifying the desired aggregation in an
  * `aggregation_request`.
@@ -84,44 +93,45 @@ class aggregation {
    * @brief Possible aggregation operations
    */
   enum Kind {
-    SUM,             ///< sum reduction
-    PRODUCT,         ///< product reduction
-    MIN,             ///< min reduction
-    MAX,             ///< max reduction
-    COUNT_VALID,     ///< count number of valid elements
-    COUNT_ALL,       ///< count number of elements
-    ANY,             ///< any reduction
-    ALL,             ///< all reduction
-    SUM_OF_SQUARES,  ///< sum of squares reduction
-    MEAN,            ///< arithmetic mean reduction
-    M2,              ///< sum of squares of differences from the mean
-    VARIANCE,        ///< variance
-    STD,             ///< standard deviation
-    MEDIAN,          ///< median reduction
-    QUANTILE,        ///< compute specified quantile(s)
-    ARGMAX,          ///< Index of max element
-    ARGMIN,          ///< Index of min element
-    NUNIQUE,         ///< count number of unique elements
-    NTH_ELEMENT,     ///< get the nth element
-    ROW_NUMBER,      ///< get row-number of current index (relative to rolling window)
-    EWMA,            ///< get exponential weighted moving average at current index
-    RANK,            ///< get rank of current index
-    COLLECT_LIST,    ///< collect values into a list
-    COLLECT_SET,     ///< collect values into a list without duplicate entries
-    LEAD,            ///< window function, accesses row at specified offset following current row
-    LAG,             ///< window function, accesses row at specified offset preceding current row
-    PTX,             ///< PTX  based UDF aggregation
-    CUDA,            ///< CUDA based UDF aggregation
-    HOST_UDF,        ///< host based UDF aggregation
-    MERGE_LISTS,     ///< merge multiple lists values into one list
-    MERGE_SETS,      ///< merge multiple lists values into one list then drop duplicate entries
-    MERGE_M2,        ///< merge partial values of M2 aggregation,
-    COVARIANCE,      ///< covariance between two sets of elements
-    CORRELATION,     ///< correlation between two sets of elements
-    TDIGEST,         ///< create a tdigest from a set of input values
-    MERGE_TDIGEST,   ///< create a tdigest by merging multiple tdigests together
-    HISTOGRAM,       ///< compute frequency of each element
-    MERGE_HISTOGRAM  ///< merge partial values of HISTOGRAM aggregation
+    SUM,              ///< sum reduction
+    PRODUCT,          ///< product reduction
+    MIN,              ///< min reduction
+    MAX,              ///< max reduction
+    COUNT_VALID,      ///< count number of valid elements
+    COUNT_ALL,        ///< count number of elements
+    ANY,              ///< any reduction
+    ALL,              ///< all reduction
+    SUM_OF_SQUARES,   ///< sum of squares reduction
+    MEAN,             ///< arithmetic mean reduction
+    M2,               ///< sum of squares of differences from the mean
+    VARIANCE,         ///< variance
+    STD,              ///< standard deviation
+    MEDIAN,           ///< median reduction
+    QUANTILE,         ///< compute specified quantile(s)
+    ARGMAX,           ///< Index of max element
+    ARGMIN,           ///< Index of min element
+    NUNIQUE,          ///< count number of unique elements
+    NTH_ELEMENT,      ///< get the nth element
+    ROW_NUMBER,       ///< get row-number of current index (relative to rolling window)
+    EWMA,             ///< get exponential weighted moving average at current index
+    RANK,             ///< get rank of current index
+    COLLECT_LIST,     ///< collect values into a list
+    COLLECT_SET,      ///< collect values into a list without duplicate entries
+    LEAD,             ///< window function, accesses row at specified offset following current row
+    LAG,              ///< window function, accesses row at specified offset preceding current row
+    PTX,              ///< PTX  based UDF aggregation
+    CUDA,             ///< CUDA based UDF aggregation
+    HOST_UDF,         ///< host based UDF aggregation
+    MERGE_LISTS,      ///< merge multiple lists values into one list
+    MERGE_SETS,       ///< merge multiple lists values into one list then drop duplicate entries
+    MERGE_M2,         ///< merge partial values of M2 aggregation
+    COVARIANCE,       ///< covariance between two sets of elements
+    CORRELATION,      ///< correlation between two sets of elements
+    TDIGEST,          ///< create a tdigest from a set of input values
+    MERGE_TDIGEST,    ///< create a tdigest by merging multiple tdigests together
+    HISTOGRAM,        ///< compute frequency of each element
+    MERGE_HISTOGRAM,  ///< merge partial values of HISTOGRAM aggregation
+    BITWISE_AGG       ///< bitwise aggregation (AND, OR, or XOR) on numeric columns
   };
 
   aggregation() = delete;
@@ -783,12 +793,21 @@ std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
 template <typename Base>
 std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
 
+/**
+ * @brief Factory to create a BITWISE_AGG aggregation.
+ *
+ * @tparam Base The aggregation base type that the returned pointer refers to
+ * @param op The bitwise operation to perform on the input column
+ * @return A BITWISE_AGG aggregation object
+ */
+template <typename Base>
+std::unique_ptr<Base> make_bitwise_aggregation(bitwise_op op);
+
 /**
  * @brief Indicate if an aggregation is supported for a source datatype.
  *
- * @param source Type of the column to perform the aggregation on.
- * @param kind The kind of the aggregation.
- * @returns true if the aggregation is supported.
+ * @param source Type of the column to perform the aggregation on
+ * @param kind The kind of the aggregation
+ * @returns true if the aggregation is supported
  */
 bool is_valid_aggregation(data_type source, aggregation::Kind kind);
 
 
@@ -107,6 +107,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class tdigest_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(
     data_type col_type, class merge_tdigest_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class bitwise_aggregation const& agg);
 };
 
 class aggregation_finalizer {  // Declares the interface for the finalizer
@@ -148,6 +150,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class tdigest_aggregation const& agg);
   virtual void visit(class merge_tdigest_aggregation const& agg);
   virtual void visit(class ewma_aggregation const& agg);
+  virtual void visit(class bitwise_aggregation const& agg);
 };
 
 /**
@@ -1221,6 +1224,41 @@ class merge_tdigest_aggregation final : public groupby_aggregation, public reduc
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived aggregation class for specifying BITWISE_AGG aggregation.
+ *
+ * Inherits from both `groupby_aggregation` and `reduce_aggregation`. The requested bitwise
+ * operation is stored in `bit_op`; it is compared in `is_equal` once the base-class comparison
+ * has passed, and its integer value is XOR-ed into the base-class hash in `do_hash`.
+ * `get_simple_aggregations` and `finalize` forward to the matching `visit` overloads of the
+ * collector and finalizer passed in.
+ */
+class bitwise_aggregation final : public groupby_aggregation, public reduce_aggregation {
+ public:
+  explicit bitwise_aggregation(bitwise_op bit_op_) : aggregation{BITWISE_AGG}, bit_op{bit_op_} {}
+
+  bitwise_op bit_op;  ///< Bitwise operation selected for this aggregation
+
+  [[nodiscard]] bool is_equal(aggregation const& _other) const override
+  {
+    if (!this->aggregation::is_equal(_other)) { return false; }
+    auto const& other = dynamic_cast<bitwise_aggregation const&>(_other);
+    return bit_op == other.bit_op;
+  }
+
+  [[nodiscard]] size_t do_hash() const override
+  {
+    return this->aggregation::do_hash() ^ static_cast<size_t>(bit_op);
+  }
+
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<bitwise_aggregation>(*this);
+  }
+
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Sentinel value used for `ARGMAX` aggregation.
  *
@@ -1503,6 +1541,14 @@ struct target_type_impl<SourceType, aggregation::HOST_UDF> {
   using type = struct_view;
 };
 
+// BITWISE_AGG returns the same type as its input; enabled only when `Source` is an integral type.
+template <typename Source>
+struct target_type_impl<Source,
+                        aggregation::BITWISE_AGG,
+                        std::enable_if_t<std::is_integral_v<Source>>> {
+  using type = Source;
+};
+
 /**
  * @brief Helper alias to get the accumulator type for performing aggregation
  * `k` on elements of type `Source`
@@ -1622,6 +1668,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::EWMA>(std::forward<Ts>(args)...);
     case aggregation::HOST_UDF:
       return f.template operator()<aggregation::HOST_UDF>(std::forward<Ts>(args)...);
+    case aggregation::BITWISE_AGG:
+      return f.template operator()<aggregation::BITWISE_AGG>(std::forward<Ts>(args)...);
     default: {
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Unsupported aggregation.");
 
@@ -251,4 +251,92 @@ struct DeviceLeadLag {
   explicit CUDF_HOST_DEVICE inline DeviceLeadLag(size_type offset_) : row_offset(offset_) {}
 };
 
+/**
+ * @brief Binary bitwise `AND` operator
+ *
+ * `operator()` is only available for integral `T` and returns `lhs & rhs`.
+ *
+ * `identity<T>()` for integral `T` returns the value that leaves any operand unchanged under
+ * `&`: `true` for `bool`, otherwise `~T{0}` (every bit set). For non-integral `T` no such value
+ * exists; that overload reports "Bitwise AND is only supported for integral types." through
+ * `CUDF_FAIL` when `__CUDA_ARCH__` is not defined and through `CUDF_UNREACHABLE` otherwise.
+ */
+struct DeviceBitAnd {
+  template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE inline T operator()(T const& lhs, T const& rhs) const
+  {
+    return lhs & rhs;
+  }
+
+  template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE static constexpr T identity()
+  {
+    if constexpr (std::is_same_v<T, bool>) {
+      return true;
+    } else {
+      return ~T{0};
+    }
+  }
+
+  template <typename T, std::enable_if_t<!std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE static constexpr T identity()
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Bitwise AND is only supported for integral types.");
+#else
+    CUDF_UNREACHABLE("Bitwise AND is only supported for integral types.");
+#endif
+    return T{};
+  }
+};
+
+/**
+ * @brief Binary bitwise `OR` operator
+ *
+ * `operator()` is only available for integral `T` and returns `lhs | rhs`.
+ *
+ * `identity<T>()` for integral `T` returns `T{0}`, the value that leaves any operand unchanged
+ * under `|`. For non-integral `T` no such value exists; that overload reports
+ * "Bitwise OR is only supported for integral types." through `CUDF_FAIL` when `__CUDA_ARCH__`
+ * is not defined and through `CUDF_UNREACHABLE` otherwise.
+ */
+struct DeviceBitOr {
+  template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE inline T operator()(T const& lhs, T const& rhs) const
+  {
+    return lhs | rhs;
+  }
+
+  template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE static constexpr T identity()
+  {
+    return T{0};
+  }
+
+  template <typename T, std::enable_if_t<!std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE static constexpr T identity()
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Bitwise OR is only supported for integral types.");
+#else
+    CUDF_UNREACHABLE("Bitwise OR is only supported for integral types.");
+#endif
+    return T{};
+  }
+};
+
+/**
+ * @brief Binary bitwise `XOR` operator
+ *
+ * `operator()` is only available for integral `T` and returns `lhs ^ rhs`.
+ *
+ * `identity<T>()` for integral `T` returns `T{0}`, the value that leaves any operand unchanged
+ * under `^`. For non-integral `T` no such value exists; that overload reports
+ * "Bitwise XOR is only supported for integral types." through `CUDF_FAIL` when `__CUDA_ARCH__`
+ * is not defined and through `CUDF_UNREACHABLE` otherwise.
+ */
+struct DeviceBitXor {
+  template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE inline T operator()(T const& lhs, T const& rhs) const
+  {
+    return lhs ^ rhs;
+  }
+
+  template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE static constexpr T identity()
+  {
+    return T{0};
+  }
+
+  template <typename T, std::enable_if_t<!std::is_integral_v<T>>* = nullptr>
+  CUDF_HOST_DEVICE static constexpr T identity()
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Bitwise XOR is only supported for integral types.");
+#else
+    CUDF_UNREACHABLE("Bitwise XOR is only supported for integral types.");
+#endif
+    return T{};
+  }
+};
+
 }  // namespace cudf