From 9aa47e1b3a386321fdb90bb8a65a068450fc10f5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 25 Nov 2025 16:34:50 -0800 Subject: [PATCH 1/9] Implement bypass for intermediate aggregations Signed-off-by: Nghia Truong --- .../groupby/hash/compute_single_pass_aggs.cuh | 2 +- .../groupby/hash/extract_single_pass_aggs.cpp | 44 ++++++++++++------- .../groupby/hash/extract_single_pass_aggs.hpp | 16 +++++-- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 510c656ea05..7791e3d8922 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -38,7 +38,7 @@ std::pair, bool> compute_single_pass_aggs( { // Collect the single-pass aggregations that can be processed in this function. // The compound aggregations that require multiple passes will be handled separately later on. - auto const [values, agg_kinds, aggs, has_compound_aggs] = + auto const [values, agg_kinds, aggs, force_non_nullable, has_compound_aggs] = extract_single_pass_aggs(requests, stream); auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, cudf::get_current_device_resource_ref()); diff --git a/cpp/src/groupby/hash/extract_single_pass_aggs.cpp b/cpp/src/groupby/hash/extract_single_pass_aggs.cpp index 242920d2c11..4ee5af7a0b0 100644 --- a/cpp/src/groupby/hash/extract_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/extract_single_pass_aggs.cpp @@ -92,47 +92,53 @@ class groupby_simple_aggregations_collector final return aggs; } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } }; std::tuple, std::vector>, + std::vector, bool> extract_single_pass_aggs(host_span requests, rmm::cuda_stream_view stream) { std::vector columns; std::vector> aggs; + std::vector force_non_nullable; auto agg_kinds = cudf::detail::make_empty_host_vector(requests.size(), stream); bool has_compound_aggs = false; for (auto const& request : requests) { - auto const& agg_v = request.aggregations; + auto const& input_aggs = request.aggregations; + + // The set of input aggregations. + std::unordered_set input_agg_kinds_set; + for (auto const& agg : input_aggs) { + input_agg_kinds_set.insert(agg->kind); + } + // The aggregations extracted from the input request, including the original single-pass + // aggregations and the intermediate single-pass aggregations replacing compound + // aggregations. std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { if (agg_kinds_set.insert(agg->kind).second) { + // Check if the inserted aggregation is an input aggregation or a replacement aggregation. + // If it is not an input aggregation, we can force its output to be non-nullable. + auto const is_input_agg = input_agg_kinds_set.contains(agg->kind); + force_non_nullable.push_back(!is_input_agg); + agg_kinds.push_back(agg->kind); aggs.push_back(std::move(agg)); columns.push_back(request_values); } }; - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto const& agg : agg_v) { + auto const values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto const& agg : input_aggs) { groupby_simple_aggregations_collector collector; auto spass_aggs = agg->get_simple_aggregations(values_type, collector); if (spass_aggs.size() > 1 || !spass_aggs.front()->is_equal(*agg)) { @@ -145,7 +151,11 @@ extract_single_pass_aggs(host_span requests, } } - return {table_view(columns), std::move(agg_kinds), std::move(aggs), has_compound_aggs}; + return {table_view(columns), + std::move(agg_kinds), + std::move(aggs), + std::move(force_non_nullable), + has_compound_aggs}; } std::vector get_simple_aggregations(groupby_aggregation const& agg, diff --git a/cpp/src/groupby/hash/extract_single_pass_aggs.hpp b/cpp/src/groupby/hash/extract_single_pass_aggs.hpp index 158e566f474..b9bf261e51b 100644 --- a/cpp/src/groupby/hash/extract_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/extract_single_pass_aggs.hpp @@ -15,9 +15,16 @@ namespace cudf::groupby::detail::hash { /** - * @brief Extract all compound aggregations into single pass aggs + * @brief Extract single pass aggregations from the given aggregation requests. * - * For example, a MEAN aggregation will be flattened into a SUM and a COUNT_VALID aggregation. + * During extraction, compound aggregations will be replaced by their corresponding single pass + * aggregations dependencies. For example, a MEAN aggregation will be replaced by a SUM and a + * COUNT_VALID aggregation. + * + * For some single-pass aggregations, we also try to reduce overhead by forcing their results + * columns to be non-nullable. For example, a SUM aggregation needed only as the intermediate result + * for M2 aggregation will not need to have a nullmask to avoid the extra nullmask update and null + * count computation overhead. * * @param requests The aggregation requests * @param stream The CUDA stream @@ -25,12 +32,15 @@ namespace cudf::groupby::detail::hash { * @return A tuple containing: * - A table_view containing the input values columns for the single-pass aggregations, * - A vector of aggregation kinds corresponding to each of these values columns, - * - A vector of aggregation objects corresponding to each of these values columns, and + * - A vector of aggregation objects corresponding to each of these values columns, + * - A vector of boolean indicating if the corresponding result will be forced to be + * non-nullable, and * - A boolean value indicating if there are any multi-pass (compound) aggregations. */ std::tuple, std::vector>, + std::vector, bool> extract_single_pass_aggs(host_span requests, rmm::cuda_stream_view stream); From a5309875c5da0e165fc5e50f165e95d013388222 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 25 Nov 2025 21:32:25 -0800 Subject: [PATCH 2/9] Avoid repeated computing null mask and null count Signed-off-by: Nghia Truong --- .../hash/compute_global_memory_aggs.cu | 1 + .../hash/compute_global_memory_aggs.cuh | 34 ++++++++++++++---- .../hash/compute_global_memory_aggs.hpp | 1 + .../hash/compute_global_memory_aggs_null.cu | 1 + .../groupby/hash/compute_single_pass_aggs.cuh | 6 ++-- .../groupby/hash/extract_single_pass_aggs.cpp | 9 ++--- .../groupby/hash/extract_single_pass_aggs.hpp | 4 +-- .../hash/hash_compound_agg_finalizer.cu | 36 +++++++++++++++---- cpp/src/groupby/hash/output_utils.cu | 26 +++++++++----- cpp/src/groupby/hash/output_utils.hpp | 3 ++ 10 files changed, 91 insertions(+), 30 deletions(-) diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index 588cd6d1cbf..fbb18cabe7c 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -13,6 +13,7 @@ compute_global_memory_aggs(bitmask_type const* row_bitmask, global_set_t const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 8fab0d3c864..6f4bf9aca6d 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -72,6 +72,7 @@ std::pair, rmm::device_uvector> compute_aggs_d SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -88,8 +89,12 @@ std::pair, rmm::device_uvector> compute_aggs_d }(); auto const d_values = table_device_view::create(values, stream); - auto agg_results = create_results_table( - static_cast(unique_keys.size()), values, h_agg_kinds, stream, mr); + auto agg_results = create_results_table(static_cast(unique_keys.size()), + values, + h_agg_kinds, + force_non_nullable, + stream, + mr); auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); thrust::for_each_n(rmm::exec_policy_nosync(stream), @@ -116,12 +121,14 @@ std::pair, rmm::device_uvector> compute_aggs_s SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto const num_rows = values.num_rows(); auto const d_values = table_device_view::create(values, stream); - auto agg_results = create_results_table(num_rows, values, h_agg_kinds, stream, mr); + auto agg_results = + create_results_table(num_rows, values, h_agg_kinds, force_non_nullable, stream, mr); auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); thrust::for_each_n( @@ -153,14 +160,27 @@ std::pair, rmm::device_uvector> compute_global SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return h_agg_kinds.size() > GROUPBY_DENSE_OUTPUT_THRESHOLD - ? compute_aggs_dense_output( - row_bitmask, values, key_set, h_agg_kinds, d_agg_kinds, stream, mr) - : compute_aggs_sparse_output_gather( - row_bitmask, values, key_set, h_agg_kinds, d_agg_kinds, stream, mr); + ? compute_aggs_dense_output(row_bitmask, + values, + key_set, + h_agg_kinds, + d_agg_kinds, + force_non_nullable, + stream, + mr) + : compute_aggs_sparse_output_gather(row_bitmask, + values, + key_set, + h_agg_kinds, + d_agg_kinds, + force_non_nullable, + stream, + mr); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index 69aa14ab200..f6edaad0528 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -20,6 +20,7 @@ std::pair, rmm::device_uvector> compute_global SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index 9d39870627c..a7a5a787984 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -13,6 +13,7 @@ compute_global_memory_aggs(bitmask_type const* row_bitmas nullable_global_set_t const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 7791e3d8922..d09202eafa5 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -50,7 +50,7 @@ std::pair, bool> compute_single_pass_aggs( // present. auto const run_aggs_by_global_mem_kernel = [&] { auto [agg_results, unique_key_indices] = compute_global_memory_aggs( - row_bitmask, values, global_set, agg_kinds, d_agg_kinds, stream, mr); + row_bitmask, values, global_set, agg_kinds, d_agg_kinds, force_non_nullable, stream, mr); finalize_output(values, aggs, agg_results, cache, stream); return std::pair{std::move(unique_key_indices), has_compound_aggs}; }; @@ -210,8 +210,8 @@ std::pair, bool> compute_single_pass_aggs( key_transform_map = rmm::device_uvector{0, stream}; // done, free up memory early auto const d_spass_values = table_device_view::create(values, stream); - auto agg_results = - create_results_table(static_cast(unique_keys.size()), values, agg_kinds, stream, mr); + auto agg_results = create_results_table( + static_cast(unique_keys.size()), values, agg_kinds, force_non_nullable, stream, mr); auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); compute_shared_memory_aggs(grid_size, diff --git a/cpp/src/groupby/hash/extract_single_pass_aggs.cpp b/cpp/src/groupby/hash/extract_single_pass_aggs.cpp index 4ee5af7a0b0..2b3c53313ca 100644 --- a/cpp/src/groupby/hash/extract_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/extract_single_pass_aggs.cpp @@ -97,14 +97,14 @@ class groupby_simple_aggregations_collector final std::tuple, std::vector>, - std::vector, + std::vector, bool> extract_single_pass_aggs(host_span requests, rmm::cuda_stream_view stream) { std::vector columns; std::vector> aggs; - std::vector force_non_nullable; + std::vector force_non_nullable; auto agg_kinds = cudf::detail::make_empty_host_vector(requests.size(), stream); bool has_compound_aggs = false; @@ -125,9 +125,10 @@ extract_single_pass_aggs(host_span requests, auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { if (agg_kinds_set.insert(agg->kind).second) { // Check if the inserted aggregation is an input aggregation or a replacement aggregation. - // If it is not an input aggregation, we can force its output to be non-nullable. + // If it is not an input aggregation, we can force its output to be non-nullable + // (by storing `1` value in the `force_non_nullable` vector). auto const is_input_agg = input_agg_kinds_set.contains(agg->kind); - force_non_nullable.push_back(!is_input_agg); + force_non_nullable.push_back(is_input_agg ? 0 : 1); agg_kinds.push_back(agg->kind); aggs.push_back(std::move(agg)); diff --git a/cpp/src/groupby/hash/extract_single_pass_aggs.hpp b/cpp/src/groupby/hash/extract_single_pass_aggs.hpp index b9bf261e51b..d7208475ae6 100644 --- a/cpp/src/groupby/hash/extract_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/extract_single_pass_aggs.hpp @@ -33,14 +33,14 @@ namespace cudf::groupby::detail::hash { * - A table_view containing the input values columns for the single-pass aggregations, * - A vector of aggregation kinds corresponding to each of these values columns, * - A vector of aggregation objects corresponding to each of these values columns, - * - A vector of boolean indicating if the corresponding result will be forced to be + * - A vector of binary values indicating if the corresponding result will be forced to be * non-nullable, and * - A boolean value indicating if there are any multi-pass (compound) aggregations. */ std::tuple, std::vector>, - std::vector, + std::vector, bool> extract_single_pass_aggs(host_span requests, rmm::cuda_stream_view stream); diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index b2ce885c388..38779008461 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -81,18 +83,40 @@ void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& ag { if (cache->has_result(col, agg)) { return; } - auto const sum_agg = make_sum_aggregation(); - auto const count_agg = make_count_aggregation(); - auto const sum_result = cache->get_result(col, *sum_agg); - auto const count_result = cache->get_result(col, *count_agg); - + auto const sum_agg = make_sum_aggregation(); + auto const count_agg = make_count_aggregation(); + auto const sum_result = cache->get_result(col, *sum_agg); + auto const count_result = cache->get_result(col, *count_agg); + auto const null_removed_sum = [&] { + if (sum_result.null_count() == 0) { return sum_result; } + return column_view{ + sum_result.type(), sum_result.size(), sum_result.head(), nullptr, 0, sum_result.offset()}; + }(); + + // Perform division without any null masks, and generate the null mask for the result later. + // This is because the null mask (if exists) is just needed to be copied from the sum result, + // and copying is faster than running the `bitmask_and` kernel. auto result = - cudf::detail::binary_operation(sum_result, + cudf::detail::binary_operation(null_removed_sum, count_result, binary_operator::DIV, cudf::detail::target_type(input_type, aggregation::MEAN), stream, mr); + // SUM result only has nulls if it is an input aggregation, not intermediate-only aggregation. + if (sum_result.has_nulls()) { + result->set_null_mask(cudf::detail::copy_bitmask(sum_result, stream, mr), + sum_result.null_count()); + } else if (col.has_nulls()) { // SUM aggregation is only intermediate result, thus it is forced + // to be non-nullable + auto [null_mask, null_count] = cudf::detail::valid_if( + count_result.begin(), + count_result.end(), + [] __device__(size_type const count) -> bool { return count > 0; }, + stream, + mr); + if (null_count > 0) { result->set_null_mask(std::move(null_mask), null_count); } + } cache->add_result(col, agg, std::move(result)); } diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu index 208135c78d1..c9f11318cb6 100644 --- a/cpp/src/groupby/hash/output_utils.cu +++ b/cpp/src/groupby/hash/output_utils.cu @@ -46,12 +46,15 @@ struct result_column_creator { { } - std::unique_ptr operator()(column_view const& col, aggregation::Kind const& agg) const + std::unique_ptr operator()(column_view const& col, + aggregation::Kind const& agg, + bool force_non_nullable) const { auto const col_type = is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(); - auto const nullable = - agg != aggregation::COUNT_VALID && agg != aggregation::COUNT_ALL && col.has_nulls(); + auto const nullable = force_non_nullable ? false + : agg != aggregation::COUNT_VALID && + agg != aggregation::COUNT_ALL && col.has_nulls(); // TODO: Remove adjusted buffer size workaround once https://github.com/NVIDIA/cccl/issues/6430 // is fixed. Use adjusted buffer size for small data types to ensure atomic operation safety. auto const make_uninitialized_column = [&](data_type d_type, size_type size, mask_state state) { @@ -97,15 +100,22 @@ struct result_column_creator { std::unique_ptr create_results_table(size_type output_size, table_view const& values, host_span agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(values.num_columns() == static_cast(agg_kinds.size()), + "The number of values columns and size of agg_kinds vector must be the same."); + CUDF_EXPECTS( + values.num_columns() == static_cast(force_non_nullable.size()), + "The number of values columns and size of force_non_nullable vector must be the same."); + + auto const column_creator = result_column_creator{output_size, stream, mr}; std::vector> output_cols; - std::transform(values.begin(), - values.end(), - agg_kinds.begin(), - std::back_inserter(output_cols), - result_column_creator{output_size, stream, mr}); + for (size_t i = 0; i < agg_kinds.size(); i++) { + output_cols.emplace_back( + column_creator(values.column(i), agg_kinds[i], static_cast(force_non_nullable[i]))); + } auto result_table = std::make_unique
(std::move(output_cols)); cudf::detail::initialize_with_identity(result_table->mutable_view(), agg_kinds, stream); return result_table; diff --git a/cpp/src/groupby/hash/output_utils.hpp b/cpp/src/groupby/hash/output_utils.hpp index cec7413ff40..48066441b56 100644 --- a/cpp/src/groupby/hash/output_utils.hpp +++ b/cpp/src/groupby/hash/output_utils.hpp @@ -23,6 +23,8 @@ namespace cudf::groupby::detail::hash { * @param output_size Number of rows in the output table * @param values The values columns to be aggregated * @param agg_kinds The aggregation kinds corresponding to each input column + * @param force_non_nullable A binary values vector indicating if the corresponding result + * will be forced to be non-nullable * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The table containing columns for storing aggregation results @@ -30,6 +32,7 @@ namespace cudf::groupby::detail::hash { std::unique_ptr
create_results_table(size_type output_size, table_view const& values, host_span agg_kinds, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); From 989e133b112bc8b10ebae2723abcf3b84de9c578 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 25 Nov 2025 21:37:47 -0800 Subject: [PATCH 3/9] Remove unnecessary calls to `visit` and add comments Signed-off-by: Nghia Truong --- .../hash/hash_compound_agg_finalizer.cu | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 38779008461..1391e40cf61 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -124,12 +124,9 @@ void hash_compound_agg_finalizer::visit(cudf::detail::m2_aggregation const& agg) { if (cache->has_result(col, agg)) { return; } - auto const sum_sqr_agg = make_sum_of_squares_aggregation(); - auto const sum_agg = make_sum_aggregation(); - auto const count_agg = make_count_aggregation(); - this->visit(*sum_sqr_agg); - this->visit(*sum_agg); - this->visit(*count_agg); + auto const sum_sqr_agg = make_sum_of_squares_aggregation(); + auto const sum_agg = make_sum_aggregation(); + auto const count_agg = make_count_aggregation(); auto const sum_sqr_result = cache->get_result(col, *sum_sqr_agg); auto const sum_result = cache->get_result(col, *sum_agg); auto const count_result = cache->get_result(col, *count_agg); @@ -142,10 +139,11 @@ void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg { if (cache->has_result(col, agg)) { return; } - auto const m2_agg = make_m2_aggregation(); - auto const count_agg = make_count_aggregation(); + auto const m2_agg = make_m2_aggregation(); + // Since M2 is a compound aggregation, we need to "finalize" it using aggregation finalizer's + // "visit" method. this->visit(*dynamic_cast(m2_agg.get())); - this->visit(*count_agg); + auto const count_agg = make_count_aggregation(); auto const m2_result = cache->get_result(col, *m2_agg); auto const count_result = cache->get_result(col, *count_agg); @@ -158,9 +156,10 @@ void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg if (cache->has_result(col, agg)) { return; } auto const m2_agg = make_m2_aggregation(); - auto const count_agg = make_count_aggregation(); + // Since M2 is a compound aggregation, we need to "finalize" it using aggregation finalizer's + // "visit" method. this->visit(*dynamic_cast(m2_agg.get())); - this->visit(*count_agg); + auto const count_agg = make_count_aggregation(); auto const m2_result = cache->get_result(col, *m2_agg); auto const count_result = cache->get_result(col, *count_agg); From 0ae3ad4f9a41e68643d1803cd08c98aaea31f6ee Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 26 Nov 2025 10:00:11 -0800 Subject: [PATCH 4/9] Update benchmark to support multiple aggregations Signed-off-by: Nghia Truong --- cpp/benchmarks/groupby/group_m2_var_std.cpp | 58 ++++++++++++--------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/cpp/benchmarks/groupby/group_m2_var_std.cpp b/cpp/benchmarks/groupby/group_m2_var_std.cpp index 9b153eab824..b9e995e1e9f 100644 --- a/cpp/benchmarks/groupby/group_m2_var_std.cpp +++ b/cpp/benchmarks/groupby/group_m2_var_std.cpp @@ -15,6 +15,7 @@ namespace { template void run_benchmark(nvbench::state& state, cudf::size_type num_rows, + cudf::size_type num_aggs, cudf::size_type value_key_ratio, double null_probability) { @@ -27,30 +28,34 @@ void run_benchmark(nvbench::state& state, return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); }(); - auto const values = [&] { - auto builder = data_profile_builder().cardinality(0).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); - if (null_probability > 0) { - builder.null_probability(null_probability); + auto values_builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + values_builder.null_probability(null_probability); + } else { + values_builder.no_validity(); + } + + std::vector> values_cols; + std::vector requests; + values_cols.reserve(num_aggs); + requests.reserve(num_aggs); + for (cudf::size_type i = 0; i < num_aggs; i++) { + auto values = create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{values_builder}); + auto request = cudf::groupby::aggregation_request{}; + request.values = values->view(); + if constexpr (Agg == cudf::aggregation::Kind::M2) { + request.aggregations.push_back(cudf::make_m2_aggregation()); + } else if constexpr (Agg == cudf::aggregation::Kind::VARIANCE) { + request.aggregations.push_back(cudf::make_variance_aggregation()); + } else if constexpr (Agg == cudf::aggregation::Kind::STD) { + request.aggregations.push_back(cudf::make_std_aggregation()); } else { - builder.no_validity(); + throw std::runtime_error("Unsupported aggregation kind."); } - return create_random_column( - cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); - }(); - - // Vector of 1 request - std::vector requests(1); - requests.back().values = values->view(); - if constexpr (Agg == cudf::aggregation::Kind::M2) { - requests.back().aggregations.push_back(cudf::make_m2_aggregation()); - } else if constexpr (Agg == cudf::aggregation::Kind::VARIANCE) { - requests.back().aggregations.push_back( - cudf::make_variance_aggregation()); - } else if constexpr (Agg == cudf::aggregation::Kind::STD) { - requests.back().aggregations.push_back(cudf::make_std_aggregation()); - } else { - throw std::runtime_error("Unsupported aggregation kind."); + values_cols.emplace_back(std::move(values)); + requests.emplace_back(std::move(request)); } auto const mem_stats_logger = cudf::memory_stats_logger(); @@ -75,8 +80,8 @@ void bench_groupby_m2_var_std(nvbench::state& state, auto const value_key_ratio = static_cast(state.get_int64("value_key_ratio")); auto const num_rows = static_cast(state.get_int64("num_rows")); auto const null_probability = state.get_float64("null_probability"); - - run_benchmark(state, num_rows, value_key_ratio, null_probability); + auto const num_aggs = static_cast(state.get_int64("num_aggs")); + run_benchmark(state, num_rows, num_aggs, value_key_ratio, null_probability); } using Types = nvbench::type_list; @@ -87,5 +92,6 @@ using AggKinds = nvbench::enum_type_list Date: Wed, 26 Nov 2025 10:01:16 -0800 Subject: [PATCH 5/9] Fix style Signed-off-by: Nghia Truong --- cpp/src/groupby/hash/compute_global_memory_aggs.cuh | 4 ++-- cpp/src/groupby/hash/hash_compound_agg_finalizer.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 6f4bf9aca6d..31954b17c9d 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -95,7 +95,7 @@ std::pair, rmm::device_uvector> compute_aggs_d force_non_nullable, stream, mr); - auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); + auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(int64_t{0}), @@ -129,7 +129,7 @@ std::pair, rmm::device_uvector> compute_aggs_s auto const d_values = table_device_view::create(values, stream); auto agg_results = create_results_table(num_rows, values, h_agg_kinds, force_non_nullable, stream, mr); - auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); + auto d_results_ptr = mutable_table_device_view::create(*agg_results, stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 1391e40cf61..8e6a5d83d76 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -155,7 +155,7 @@ void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg { if (cache->has_result(col, agg)) { return; } - auto const m2_agg = make_m2_aggregation(); + auto const m2_agg = make_m2_aggregation(); // Since M2 is a compound aggregation, we need to "finalize" it using aggregation finalizer's // "visit" method. this->visit(*dynamic_cast(m2_agg.get())); From 15a465efbf50dc9247097792c6ec284396e05048 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 26 Nov 2025 11:54:49 -0800 Subject: [PATCH 6/9] Change `int` type to `int8_t` Signed-off-by: Nghia Truong --- cpp/src/groupby/hash/compute_global_memory_aggs.cu | 2 +- cpp/src/groupby/hash/compute_global_memory_aggs.cuh | 6 +++--- cpp/src/groupby/hash/compute_global_memory_aggs.hpp | 2 +- cpp/src/groupby/hash/compute_global_memory_aggs_null.cu | 2 +- cpp/src/groupby/hash/extract_single_pass_aggs.cpp | 4 ++-- cpp/src/groupby/hash/extract_single_pass_aggs.hpp | 2 +- cpp/src/groupby/hash/output_utils.cu | 2 +- cpp/src/groupby/hash/output_utils.hpp | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index fbb18cabe7c..e9cb1192ea8 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -13,7 +13,7 @@ compute_global_memory_aggs(bitmask_type const* row_bitmask, global_set_t const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 31954b17c9d..934dbb5e8cf 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -72,7 +72,7 @@ std::pair, rmm::device_uvector> compute_aggs_d SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -121,7 +121,7 @@ std::pair, rmm::device_uvector> compute_aggs_s SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -160,7 +160,7 @@ std::pair, rmm::device_uvector> compute_global SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index f6edaad0528..8a53100a651 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -20,7 +20,7 @@ std::pair, rmm::device_uvector> compute_global SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index a7a5a787984..438836bbd08 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -13,7 +13,7 @@ compute_global_memory_aggs(bitmask_type const* row_bitmas nullable_global_set_t const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/extract_single_pass_aggs.cpp b/cpp/src/groupby/hash/extract_single_pass_aggs.cpp index 2b3c53313ca..0aae34a039c 100644 --- a/cpp/src/groupby/hash/extract_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/extract_single_pass_aggs.cpp @@ -97,14 +97,14 @@ class groupby_simple_aggregations_collector final std::tuple, std::vector>, - std::vector, + std::vector, bool> extract_single_pass_aggs(host_span requests, rmm::cuda_stream_view stream) { std::vector columns; std::vector> aggs; - std::vector force_non_nullable; + std::vector force_non_nullable; auto agg_kinds = cudf::detail::make_empty_host_vector(requests.size(), stream); bool has_compound_aggs = false; diff --git a/cpp/src/groupby/hash/extract_single_pass_aggs.hpp b/cpp/src/groupby/hash/extract_single_pass_aggs.hpp index d7208475ae6..144aa719ce7 100644 --- a/cpp/src/groupby/hash/extract_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/extract_single_pass_aggs.hpp @@ -40,7 +40,7 @@ namespace cudf::groupby::detail::hash { std::tuple, std::vector>, - std::vector, + std::vector, bool> extract_single_pass_aggs(host_span requests, rmm::cuda_stream_view stream); diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu index c9f11318cb6..2c56c7d529f 100644 --- a/cpp/src/groupby/hash/output_utils.cu +++ b/cpp/src/groupby/hash/output_utils.cu @@ -100,7 +100,7 @@ struct result_column_creator { std::unique_ptr
create_results_table(size_type output_size, table_view const& values, host_span agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { diff --git a/cpp/src/groupby/hash/output_utils.hpp b/cpp/src/groupby/hash/output_utils.hpp index 48066441b56..27d1de2d905 100644 --- a/cpp/src/groupby/hash/output_utils.hpp +++ b/cpp/src/groupby/hash/output_utils.hpp @@ -32,7 +32,7 @@ namespace cudf::groupby::detail::hash { std::unique_ptr
create_results_table(size_type output_size, table_view const& values, host_span agg_kinds, - host_span force_non_nullable, + host_span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); From 43fff53596032473db9692297733c35e59f60619 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 26 Nov 2025 13:03:41 -0800 Subject: [PATCH 7/9] Rewrite nullable condition for the output column Signed-off-by: Nghia Truong --- cpp/src/groupby/hash/output_utils.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu index 2c56c7d529f..63c92f1f709 100644 --- a/cpp/src/groupby/hash/output_utils.cu +++ b/cpp/src/groupby/hash/output_utils.cu @@ -52,9 +52,8 @@ struct result_column_creator { { auto const col_type = is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(); - auto const nullable = force_non_nullable ? false - : agg != aggregation::COUNT_VALID && - agg != aggregation::COUNT_ALL && col.has_nulls(); + auto const nullable = !force_non_nullable && agg != aggregation::COUNT_VALID && + agg != aggregation::COUNT_ALL && col.has_nulls(); // TODO: Remove adjusted buffer size workaround once https://github.com/NVIDIA/cccl/issues/6430 // is fixed. Use adjusted buffer size for small data types to ensure atomic operation safety. auto const make_uninitialized_column = [&](data_type d_type, size_type size, mask_state state) { From dc633e582fc07df6531c6a87509cb4e3a0c6f793 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 26 Nov 2025 18:21:18 -0800 Subject: [PATCH 8/9] Use `std::span` as function parameter Signed-off-by: Nghia Truong --- cpp/src/groupby/hash/compute_global_memory_aggs.cu | 2 +- cpp/src/groupby/hash/compute_global_memory_aggs.cuh | 6 +++--- cpp/src/groupby/hash/compute_global_memory_aggs.hpp | 2 +- cpp/src/groupby/hash/compute_global_memory_aggs_null.cu | 2 +- cpp/src/groupby/hash/output_utils.cu | 2 +- cpp/src/groupby/hash/output_utils.hpp | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index e9cb1192ea8..4f31f5eda75 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -13,7 +13,7 @@ compute_global_memory_aggs(bitmask_type const* row_bitmask, global_set_t const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 934dbb5e8cf..fa3a7981f75 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -72,7 +72,7 @@ std::pair, rmm::device_uvector> compute_aggs_d SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -121,7 +121,7 @@ std::pair, rmm::device_uvector> compute_aggs_s SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -160,7 +160,7 @@ std::pair, rmm::device_uvector> compute_global SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index 8a53100a651..2f2c3ff7f51 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -20,7 +20,7 @@ std::pair, rmm::device_uvector> compute_global SetType const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index 438836bbd08..168cfe0800a 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -13,7 +13,7 @@ compute_global_memory_aggs(bitmask_type const* row_bitmas nullable_global_set_t const& key_set, host_span h_agg_kinds, device_span d_agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu index 63c92f1f709..4d5f473149c 100644 --- a/cpp/src/groupby/hash/output_utils.cu +++ b/cpp/src/groupby/hash/output_utils.cu @@ -99,7 +99,7 @@ struct result_column_creator { std::unique_ptr
create_results_table(size_type output_size, table_view const& values, host_span agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { diff --git a/cpp/src/groupby/hash/output_utils.hpp b/cpp/src/groupby/hash/output_utils.hpp index 27d1de2d905..1bb84c930cd 100644 --- a/cpp/src/groupby/hash/output_utils.hpp +++ b/cpp/src/groupby/hash/output_utils.hpp @@ -32,7 +32,7 @@ namespace cudf::groupby::detail::hash { std::unique_ptr
create_results_table(size_type output_size, table_view const& values, host_span agg_kinds, - host_span force_non_nullable, + std::span force_non_nullable, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); From 794c48be00d5e9a86cb545df5e11f9af5237d9c3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 29 Nov 2025 21:53:12 -0800 Subject: [PATCH 9/9] Fix null check when gathering argmin/argmax Signed-off-by: Nghia Truong --- .../groupby/hash/hash_compound_agg_finalizer.cu | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 8e6a5d83d76..c2ac6027c4a 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -48,14 +48,13 @@ auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) static_cast(arg_result.data()), nullptr, 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result.nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); + auto gather_argminmax = cudf::detail::gather( + table_view{{col}}, + null_removed_map, + col.nullable() ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); return std::move(gather_argminmax->release()[0]); }