Correct sparse handling for Aggregation clauses

IvoDD · IvoDD · commit ac4e12b5e7bf · 2025-09-17T12:12:33.000+03:00
- Makes Aggregation clauses like `Mean` and `Count` respect input column
  sparsity
- Fixes `CopyToBufferTask` to respect sparsity for Arrow
diff --git a/cpp/arcticdb/arrow/array_from_block.hpp b/cpp/arcticdb/arrow/array_from_block.hpp
@@ -19,6 +19,11 @@ inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(
 ) {
     if (column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
         auto& bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
+        util::check(
+                bitmap_buffer.blocks().size() == 1,
+                "Expected a single block bitmap extra buffer but got {} blocks",
+                bitmap_buffer.blocks().size()
+        );
         return sparrow::validity_bitmap{reinterpret_cast<uint8_t*>(bitmap_buffer.block(0)->release()), bitmap_size};
     } else {
         return std::nullopt;
diff --git a/cpp/arcticdb/processing/test/test_clause.cpp b/cpp/arcticdb/processing/test/test_clause.cpp
@@ -112,12 +112,7 @@ void check_column(arcticdb::SegmentInMemory segment, std::string_view column_nam
     ASSERT_EQ(dt, column.type().data_type());
     for (std::size_t idx = 0u; idx < ugv; ++idx) {
         if constexpr (std::is_floating_point_v<T>) {
-            const T val = column.scalar_at<T>(idx).value();
-            if (std::isnan(val)) {
-                ASSERT_TRUE(std::isnan(f(idx)));
-            } else {
-                ASSERT_EQ(f(idx), val);
-            }
+            ASSERT_EQ(f(idx), column.scalar_at<T>(idx));
         } else {
             ASSERT_EQ(f(idx), column.scalar_at<T>(idx));
         }
@@ -192,17 +187,22 @@ TEST(Clause, AggregationSparseColumn) {
         return idx % 2 == 0 ? 450 + 10 * idx : 0;
     });
     check_column<int64_t>(*segments[0], "min_int", unique_grouping_values, [](size_t idx) -> std::optional<int64_t> {
-        return idx % 2 == 0 ? std::optional{static_cast<int64_t>(idx)} : std::nullopt;
+        return idx % 2 == 0 ? std::make_optional<int64_t>(idx) : std::nullopt;
     });
     check_column<int64_t>(*segments[0], "max_int", unique_grouping_values, [](size_t idx) -> std::optional<int64_t> {
-        return idx % 2 == 0 ? std::optional{static_cast<int64_t>(90 + idx)} : std::nullopt;
-    });
-    check_column<double>(*segments[0], "mean_int", unique_grouping_values, [](size_t idx) -> double {
-        return idx % 2 == 0 ? 45 + idx : std::numeric_limits<double>::quiet_NaN();
+        return idx % 2 == 0 ? std::make_optional<int64_t>(90 + idx) : std::nullopt;
     });
-    check_column<uint64_t>(*segments[0], "count_int", unique_grouping_values, [](size_t idx) -> uint64_t {
-        return idx % 2 == 0 ? 10 : 0;
+    check_column<double>(*segments[0], "mean_int", unique_grouping_values, [](size_t idx) -> std::optional<double> {
+        return idx % 2 == 0 ? std::make_optional<double>(45 + idx) : std::nullopt;
     });
+    check_column<uint64_t>(
+            *segments[0],
+            "count_int",
+            unique_grouping_values,
+            [](size_t idx) -> std::optional<uint64_t> {
+                return idx % 2 == 0 ? std::make_optional<uint64_t>(10) : std::nullopt;
+            }
+    );
 }
 
 TEST(Clause, AggregationSparseGroupby) {
diff --git a/cpp/arcticdb/processing/test/test_unsorted_aggregation.cpp b/cpp/arcticdb/processing/test/test_unsorted_aggregation.cpp
@@ -87,7 +87,7 @@ class AggregationResult : public ::testing::TestWithParam<DataType> {
         if constexpr (is_bool_type(InputTypeTag::data_type())) {
             return std::array{2 / 3.0, 0.0, 1.0, 1 / 3.0};
         } else if constexpr (is_empty_type(InputTypeTag::data_type())) {
-            return std::array{0.0, 0.0, 0.0};
+            return std::array<double, 0>{};
         }
     }
 
@@ -148,7 +148,11 @@ TEST_P(AggregationResult, Mean) {
             ASSERT_EQ(result.field(0).type(), make_scalar_type(OutputDataTypeTag::data_type()));
             ASSERT_EQ(result.field(0).name(), "output");
             const Column& aggregated_column = result.column(0);
-            ASSERT_EQ(aggregated_column.row_count(), group_count);
+            if constexpr (!is_empty_type(TypeTag::data_type)) {
+                ASSERT_EQ(aggregated_column.row_count(), group_count);
+            } else {
+                ASSERT_EQ(aggregated_column.row_count(), 0);
+            }
             constexpr static std::array expected = get_expected_result_mean<InputDataTypeTag>();
             Column::for_each_enumerated<OutputDataTypeTag>(aggregated_column, [&](const auto& row) {
                 ASSERT_EQ(row.value(), expected[row.idx()]);
diff --git a/cpp/arcticdb/processing/unsorted_aggregation.cpp b/cpp/arcticdb/processing/unsorted_aggregation.cpp
@@ -429,6 +429,7 @@ void MeanAggregatorData::aggregate(
         const ColumnWithStrings& input_column, const std::vector<size_t>& groups, size_t unique_values
 ) {
     fractions_.resize(unique_values);
+    sparse_map_.resize(unique_values);
     details::visit_type(input_column.column_->type().data_type(), [&input_column, &groups, this](auto col_tag) {
         using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
         if constexpr (is_sequence_type(col_type_info::data_type)) {
@@ -444,10 +445,12 @@ void MeanAggregatorData::aggregate(
                         if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) {
                             fraction.numerator_ += static_cast<double>(enumerating_it.value());
                             ++fraction.denominator_;
+                            sparse_map_.set(groups[enumerating_it.idx()]);
                         }
                     } else {
                         fraction.numerator_ += static_cast<double>(enumerating_it.value());
                         ++fraction.denominator_;
+                        sparse_map_.set(groups[enumerating_it.idx()]);
                     }
                 }
         );
@@ -458,34 +461,25 @@ SegmentInMemory MeanAggregatorData::finalize(const ColumnName& output_column_nam
     SegmentInMemory res;
     if (!fractions_.empty()) {
         fractions_.resize(unique_values);
-        auto col = std::make_shared<Column>(
-                make_scalar_type(get_output_data_type()),
-                fractions_.size(),
-                AllocationType::PRESIZED,
-                Sparsity::NOT_PERMITTED
-        );
-        auto column_data = col->data();
-        // TODO: Empty type needs more though. Maybe we should emit a column of empty value and leave it to the
-        //   NullValueReducer to handle it. As of this PR (04.07.2025) the empty type is feature flagged and not used so
-        //   we don't worry too much about optimizing it.
+        sparse_map_.resize(unique_values);
+        auto col =
+                create_output_column(make_scalar_type(get_output_data_type()), std::move(sparse_map_), unique_values);
+        // TODO: Empty type needs more thought. Currently we emit a fully sparse column which will be populated by
+        // `copy_frame_data_to_buffer` but this might not be the right approach. As of this PR (11.09.2025) the empty
+        // type is feature flagged and not used so we don't worry too much about optimizing it.
         if (data_type_ && *data_type_ == DataType::EMPTYVAL) [[unlikely]] {
-            std::fill_n(column_data.begin<ScalarTagType<DataTypeTag<DataType::FLOAT64>>>(), fractions_.size(), 0.f);
+            auto empty_bitset = util::BitSet(unique_values);
+            col->set_sparse_map(std::move(empty_bitset));
         } else {
             details::visit_type(col->type().data_type(), [&, this]<typename TypeTag>(TypeTag) {
                 using OutputDataTypeTag =
                         std::conditional_t<is_time_type(TypeTag::data_type), TypeTag, DataTypeTag<DataType::FLOAT64>>;
                 using OutputTypeDescriptor = typename ScalarTypeInfo<OutputDataTypeTag>::TDT;
-                std::transform(
-                        fractions_.cbegin(),
-                        fractions_.cend(),
-                        column_data.begin<OutputTypeDescriptor>(),
-                        [](const auto& fraction) {
-                            return static_cast<typename OutputDataTypeTag::raw_type>(fraction.to_double());
-                        }
-                );
+                Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
+                    row.value() = static_cast<typename OutputDataTypeTag::raw_type>(fractions_[row.idx()].to_double());
+                });
             });
         }
-        col->set_row_data(fractions_.size() - 1);
         res.add_column(scalar_field(get_output_data_type(), output_column_name.value), std::move(col));
     }
     return res;
@@ -505,6 +499,7 @@ void CountAggregatorData::aggregate(
         const ColumnWithStrings& input_column, const std::vector<size_t>& groups, size_t unique_values
 ) {
     aggregated_.resize(unique_values);
+    sparse_map_.resize(unique_values);
     details::visit_type(input_column.column_->type().data_type(), [&input_column, &groups, this](auto col_tag) {
         using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
         Column::for_each_enumerated<typename col_type_info::TDT>(
@@ -514,10 +509,12 @@ void CountAggregatorData::aggregate(
                         if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) {
                             auto& val = aggregated_[groups[enumerating_it.idx()]];
                             ++val;
+                            sparse_map_.set(groups[enumerating_it.idx()]);
                         }
                     } else {
                         auto& val = aggregated_[groups[enumerating_it.idx()]];
                         ++val;
+                        sparse_map_.set(groups[enumerating_it.idx()]);
                     }
                 }
         );
@@ -528,13 +525,20 @@ SegmentInMemory CountAggregatorData::finalize(const ColumnName& output_column_na
     SegmentInMemory res;
     if (!aggregated_.empty()) {
         aggregated_.resize(unique_values);
-        auto pos = res.add_column(
-                scalar_field(DataType::UINT64, output_column_name.value), unique_values, AllocationType::PRESIZED
-        );
-        auto& column = res.column(pos);
-        auto ptr = reinterpret_cast<uint64_t*>(column.ptr());
-        column.set_row_data(unique_values - 1);
-        memcpy(ptr, aggregated_.data(), sizeof(uint64_t) * unique_values);
+        sparse_map_.resize(unique_values);
+        auto col =
+                create_output_column(make_scalar_type(get_output_data_type()), std::move(sparse_map_), unique_values);
+        if (!col->opt_sparse_map().has_value()) {
+            // If all values are set we use memcpy for efficiency
+            auto ptr = reinterpret_cast<uint64_t*>(col->ptr());
+            memcpy(ptr, aggregated_.data(), sizeof(uint64_t) * unique_values);
+        } else {
+            using OutputTypeDescriptor = typename ScalarTypeInfo<DataTypeTag<DataType::UINT64>>::TDT;
+            Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
+                row.value() = aggregated_[row.idx()];
+            });
+        }
+        res.add_column(scalar_field(get_output_data_type(), output_column_name.value), std::move(col));
     }
     return res;
 }
@@ -556,6 +560,7 @@ void FirstAggregatorData::aggregate(
             using GlobalTypeDescriptorTag = typename OutputType<GlobalInputType>::type;
             using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
             aggregated_.resize(sizeof(GlobalRawType) * unique_values);
+            sparse_map_.resize(unique_values);
             auto col_data = input_column.column_->data();
             auto out_ptr = reinterpret_cast<GlobalRawType*>(aggregated_.data());
             details::visit_type(
@@ -575,11 +580,13 @@ void FirstAggregatorData::aggregate(
                                     if (is_first_group_el || std::isnan(static_cast<ColumnType>(val))) {
                                         groups_cache_.insert(groups[groups_pos]);
                                         val = GlobalRawType(*ptr);
+                                        sparse_map_.set(groups[groups_pos]);
                                     }
                                 } else {
                                     if (is_first_group_el) {
                                         groups_cache_.insert(groups[groups_pos]);
                                         val = GlobalRawType(*ptr);
+                                        sparse_map_.set(groups[groups_pos]);
                                     }
                                 }
                             }
@@ -594,17 +601,23 @@ SegmentInMemory FirstAggregatorData::finalize(const ColumnName& output_column_na
     SegmentInMemory res;
     if (!aggregated_.empty()) {
         details::visit_type(*data_type_, [this, &res, &output_column_name, unique_values](auto col_tag) {
-            using RawType = typename decltype(col_tag)::DataTypeTag::raw_type;
+            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
+            using RawType = typename col_type_info::RawType;
             aggregated_.resize(sizeof(RawType) * unique_values);
-            auto col = std::make_shared<Column>(
-                    make_scalar_type(data_type_.value()),
-                    unique_values,
-                    AllocationType::PRESIZED,
-                    Sparsity::NOT_PERMITTED
-            );
-            memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            sparse_map_.resize(unique_values);
+            auto col =
+                    create_output_column(make_scalar_type(data_type_.value()), std::move(sparse_map_), unique_values);
+            if (!col->opt_sparse_map().has_value()) {
+                memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            } else {
+                const std::span<const RawType> group_values{
+                        reinterpret_cast<const RawType*>(aggregated_.data()), aggregated_.size() / sizeof(RawType)
+                };
+                Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
+                    row.value() = group_values[row.idx()];
+                });
+            }
             res.add_column(scalar_field(data_type_.value(), output_column_name.value), col);
-            col->set_row_data(unique_values - 1);
         });
     }
     return res;
@@ -627,6 +640,7 @@ void LastAggregatorData::aggregate(
             using GlobalTypeDescriptorTag = typename OutputType<GlobalInputType>::type;
             using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
             aggregated_.resize(sizeof(GlobalRawType) * unique_values);
+            sparse_map_.resize(unique_values);
             auto col_data = input_column.column_->data();
             auto out_ptr = reinterpret_cast<GlobalRawType*>(aggregated_.data());
             details::visit_type(
@@ -648,9 +662,11 @@ void LastAggregatorData::aggregate(
                                     if (is_first_group_el || !std::isnan(static_cast<ColumnType>(curr))) {
                                         groups_cache_.insert(groups[groups_pos]);
                                         val = curr;
+                                        sparse_map_.set(groups[groups_pos]);
                                     }
                                 } else {
                                     val = GlobalRawType(*ptr);
+                                    sparse_map_.set(groups[groups_pos]);
                                 }
                             }
                         }
@@ -663,18 +679,24 @@ void LastAggregatorData::aggregate(
 SegmentInMemory LastAggregatorData::finalize(const ColumnName& output_column_name, bool, size_t unique_values) {
     SegmentInMemory res;
     if (!aggregated_.empty()) {
-        details::visit_type(*data_type_, [that = this, &res, &output_column_name, unique_values](auto col_tag) {
-            using RawType = typename decltype(col_tag)::DataTypeTag::raw_type;
-            that->aggregated_.resize(sizeof(RawType) * unique_values);
-            auto col = std::make_shared<Column>(
-                    make_scalar_type(that->data_type_.value()),
-                    unique_values,
-                    AllocationType::PRESIZED,
-                    Sparsity::NOT_PERMITTED
-            );
-            memcpy(col->ptr(), that->aggregated_.data(), that->aggregated_.size());
-            res.add_column(scalar_field(that->data_type_.value(), output_column_name.value), col);
-            col->set_row_data(unique_values - 1);
+        details::visit_type(*data_type_, [&res, &output_column_name, unique_values, this](auto col_tag) {
+            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
+            using RawType = typename col_type_info::RawType;
+            aggregated_.resize(sizeof(RawType) * unique_values);
+            sparse_map_.resize(unique_values);
+            auto col =
+                    create_output_column(make_scalar_type(data_type_.value()), std::move(sparse_map_), unique_values);
+            if (!col->opt_sparse_map().has_value()) {
+                memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            } else {
+                const std::span<const RawType> group_values{
+                        reinterpret_cast<const RawType*>(aggregated_.data()), aggregated_.size() / sizeof(RawType)
+                };
+                Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
+                    row.value() = group_values[row.idx()];
+                });
+            }
+            res.add_column(scalar_field(data_type_.value(), output_column_name.value), col);
         });
     }
     return res;
diff --git a/cpp/arcticdb/processing/unsorted_aggregation.hpp b/cpp/arcticdb/processing/unsorted_aggregation.hpp
@@ -118,6 +118,7 @@ class MeanAggregatorData : private AggregatorDataBase {
     };
     std::vector<Fraction> fractions_;
     std::optional<DataType> data_type_;
+    util::BitMagic sparse_map_;
 };
 
 class CountAggregatorData : private AggregatorDataBase {
@@ -131,6 +132,7 @@ class CountAggregatorData : private AggregatorDataBase {
 
   private:
     std::vector<uint64_t> aggregated_;
+    util::BitMagic sparse_map_;
 };
 
 class FirstAggregatorData : private AggregatorDataBase {
@@ -146,6 +148,7 @@ class FirstAggregatorData : private AggregatorDataBase {
     std::optional<DataType> data_type_;
 
     std::unordered_set<size_t> groups_cache_;
+    util::BitMagic sparse_map_;
 };
 
 class LastAggregatorData : private AggregatorDataBase {
@@ -161,6 +164,7 @@ class LastAggregatorData : private AggregatorDataBase {
     std::optional<DataType> data_type_;
 
     std::unordered_set<size_t> groups_cache_;
+    util::BitMagic sparse_map_;
 };
 
 template<class AggregatorData>
diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow.py b/python/tests/unit/arcticdb/version_store/test_arrow.py