Correct sparse handling for Aggregation clauses

IvoDD · IvoDD · commit 10da688bf572 · 2025-09-16T12:26:28.000+03:00
- Makes aggregation clauses such as `Mean`, `Count`, `First` and `Last`
  respect input column sparsity
- Fixes `CopyToBufferTask` to respect sparsity for Arrow
diff --git a/cpp/arcticdb/arrow/array_from_block.hpp b/cpp/arcticdb/arrow/array_from_block.hpp
@@ -16,6 +16,10 @@ namespace arcticdb {
 inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(size_t offset, const Column& column, size_t bitmap_size) {
     if(column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
         auto &bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
+        util::check(
+            bitmap_buffer.blocks().size() == 1,
+            "Expected a single block bitmap extra buffer but got {} blocks",
+            bitmap_buffer.blocks().size());
         return sparrow::validity_bitmap{reinterpret_cast<uint8_t *>(bitmap_buffer.block(0)->release()), bitmap_size};
     } else {
         return std::nullopt;
diff --git a/cpp/arcticdb/processing/test/test_clause.cpp b/cpp/arcticdb/processing/test/test_clause.cpp
@@ -103,12 +103,7 @@ namespace aggregation_test
         ASSERT_EQ(dt, column.type().data_type());
         for(std::size_t idx = 0u; idx < ugv; ++idx) {
             if constexpr (std::is_floating_point_v<T>) {
-                const T val = column.scalar_at<T>(idx).value();
-                if (std::isnan(val)) {
-                    ASSERT_TRUE(std::isnan(f(idx)));
-                } else {
-                    ASSERT_EQ(f(idx), val);
-                }
+                ASSERT_EQ(f(idx), column.scalar_at<T>(idx));
             } else {
                 ASSERT_EQ(f(idx), column.scalar_at<T>(idx));
             }
@@ -175,16 +170,16 @@ TEST(Clause, AggregationSparseColumn)
         return idx % 2 == 0 ? 450 + 10 * idx : 0;
     });
     check_column<int64_t>(*segments[0], "min_int", unique_grouping_values, [](size_t idx) -> std::optional<int64_t> {
-        return idx % 2 == 0 ? std::optional{static_cast<int64_t>(idx)} : std::nullopt;
+        return idx % 2 == 0 ? std::make_optional<int64_t>(idx) : std::nullopt;
     });
     check_column<int64_t>(*segments[0], "max_int", unique_grouping_values, [](size_t idx) -> std::optional<int64_t>  {
-        return idx % 2 == 0 ? std::optional{static_cast<int64_t>(90 + idx)} : std::nullopt;
+        return idx % 2 == 0 ? std::make_optional<int64_t>(90 + idx) : std::nullopt;
     });
-    check_column<double>(*segments[0], "mean_int", unique_grouping_values, [](size_t idx) -> double {
-        return idx % 2 == 0 ? 45 + idx : std::numeric_limits<double>::quiet_NaN();
+    check_column<double>(*segments[0], "mean_int", unique_grouping_values, [](size_t idx) -> std::optional<double> {
+        return idx % 2 == 0 ? std::make_optional<double>(45 + idx) : std::nullopt;
     });
-    check_column<uint64_t>(*segments[0], "count_int", unique_grouping_values, [](size_t idx) -> uint64_t {
-        return idx % 2 == 0 ? 10 : 0;
+    check_column<uint64_t>(*segments[0], "count_int", unique_grouping_values, [](size_t idx) -> std::optional<uint64_t> {
+        return idx % 2 == 0 ? std::make_optional<uint64_t>(10) : std::nullopt;
     });
 }
 
diff --git a/cpp/arcticdb/processing/test/test_unsorted_aggregation.cpp b/cpp/arcticdb/processing/test/test_unsorted_aggregation.cpp
@@ -83,7 +83,7 @@ class AggregationResult : public ::testing::TestWithParam<DataType> {
         } if constexpr (is_bool_type(InputTypeTag::data_type())) {
             return std::array{ 2 / 3.0, 0.0, 1.0, 1 / 3.0};
         } else if constexpr (is_empty_type(InputTypeTag::data_type())) {
-            return std::array{0.0, 0.0, 0.0};
+            return std::array<double, 0>{};
         }
     }
 
@@ -139,7 +139,11 @@ TEST_P(AggregationResult, Mean) {
             ASSERT_EQ(result.field(0).type(), make_scalar_type(OutputDataTypeTag::data_type()));
             ASSERT_EQ(result.field(0).name(), "output");
             const Column& aggregated_column = result.column(0);
-            ASSERT_EQ(aggregated_column.row_count(), group_count);
+            if constexpr (!is_empty_type(TypeTag::data_type)) {
+                ASSERT_EQ(aggregated_column.row_count(), group_count);
+            } else {
+                ASSERT_EQ(aggregated_column.row_count(), 0);
+            }
             constexpr static std::array expected = get_expected_result_mean<InputDataTypeTag>();
             Column::for_each_enumerated<OutputDataTypeTag>(aggregated_column, [&](const auto& row) {
                 ASSERT_EQ(row.value(), expected[row.idx()]);
diff --git a/cpp/arcticdb/processing/unsorted_aggregation.cpp b/cpp/arcticdb/processing/unsorted_aggregation.cpp
@@ -426,6 +426,7 @@ DataType MeanAggregatorData::get_output_data_type() {
 
 void MeanAggregatorData::aggregate(const ColumnWithStrings& input_column, const std::vector<size_t>& groups, size_t unique_values) {
     fractions_.resize(unique_values);
+    sparse_map_.resize(unique_values);
     details::visit_type(input_column.column_->type().data_type(), [&input_column, &groups, this] (auto col_tag) {
         using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
         if constexpr (is_sequence_type(col_type_info::data_type)) {
@@ -439,10 +440,12 @@ void MeanAggregatorData::aggregate(const ColumnWithStrings& input_column, const
                 if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) {
                     fraction.numerator_ += static_cast<double>(enumerating_it.value());
                     ++fraction.denominator_;
+                    sparse_map_.set(groups[enumerating_it.idx()]);
                 }
             } else {
                 fraction.numerator_ += static_cast<double>(enumerating_it.value());
                 ++fraction.denominator_;
+                sparse_map_.set(groups[enumerating_it.idx()]);
             }
         });
     });
@@ -452,25 +455,23 @@ SegmentInMemory MeanAggregatorData::finalize(const ColumnName& output_column_nam
     SegmentInMemory res;
     if(!fractions_.empty()) {
         fractions_.resize(unique_values);
-        auto col = std::make_shared<Column>(make_scalar_type(get_output_data_type()), fractions_.size(), AllocationType::PRESIZED, Sparsity::NOT_PERMITTED);
-        auto column_data = col->data();
-        // TODO: Empty type needs more though. Maybe we should emit a column of empty value and leave it to the
-        //   NullValueReducer to handle it. As of this PR (04.07.2025) the empty type is feature flagged and not used so
-        //   we don't worry too much about optimizing it.
+        sparse_map_.resize(unique_values);
+        auto col = create_output_column(make_scalar_type(get_output_data_type()), std::move(sparse_map_), unique_values);
+        // TODO: Empty type needs more thought. Currently we emit a fully sparse column which will be populated by
+        // `copy_frame_data_to_buffer` but this might not be the right approach. As of this PR (11.09.2025) the empty
+        // type is feature flagged and not used so we don't worry too much about optimizing it.
         if (data_type_ && *data_type_ == DataType::EMPTYVAL) [[unlikely]] {
-            std::fill_n(column_data.begin<ScalarTagType<DataTypeTag<DataType::FLOAT64>>>(), fractions_.size(), 0.f);
+            auto empty_bitset = util::BitSet(unique_values);
+            col->set_sparse_map(std::move(empty_bitset));
         } else {
             details::visit_type(col->type().data_type(), [&, this]<typename TypeTag>(TypeTag) {
-               using OutputDataTypeTag = std::conditional_t<is_time_type(TypeTag::data_type), TypeTag, DataTypeTag<DataType::FLOAT64>>;
-               using OutputTypeDescriptor = typename ScalarTypeInfo<OutputDataTypeTag>::TDT;
-               std::transform(fractions_.cbegin(), fractions_.cend(),
-                              column_data.begin<OutputTypeDescriptor>(),
-                              [](const auto &fraction) {
-                                 return static_cast<typename OutputDataTypeTag::raw_type>(fraction.to_double());
-                              });
-           });
+                using OutputDataTypeTag = std::conditional_t<is_time_type(TypeTag::data_type), TypeTag, DataTypeTag<DataType::FLOAT64>>;
+                using OutputTypeDescriptor = typename ScalarTypeInfo<OutputDataTypeTag>::TDT;
+                Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
+                    row.value() = static_cast<typename OutputDataTypeTag::raw_type>(fractions_[row.idx()].to_double());
+                });
+            });
         }
-        col->set_row_data(fractions_.size() - 1);
         res.add_column(scalar_field(get_output_data_type(), output_column_name.value), std::move(col));
     }
     return res;
@@ -490,17 +491,20 @@ std::optional<Value> MeanAggregatorData::get_default_value() {
 
 void CountAggregatorData::aggregate(const ColumnWithStrings& input_column, const std::vector<size_t>& groups, size_t unique_values) {
     aggregated_.resize(unique_values);
+    sparse_map_.resize(unique_values);
     details::visit_type(input_column.column_->type().data_type(), [&input_column, &groups, this] (auto col_tag) {
         using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
         Column::for_each_enumerated<typename col_type_info::TDT>(*input_column.column_, [&groups, this](auto enumerating_it) {
             if constexpr (is_floating_point_type(col_type_info::data_type)) {
                 if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) {
                     auto& val = aggregated_[groups[enumerating_it.idx()]];
                     ++val;
+                    sparse_map_.set(groups[enumerating_it.idx()]);
                 }
             } else {
                 auto& val = aggregated_[groups[enumerating_it.idx()]];
                 ++val;
+                sparse_map_.set(groups[enumerating_it.idx()]);
             }
         });
     });
@@ -510,11 +514,19 @@ SegmentInMemory CountAggregatorData::finalize(const ColumnName& output_column_na
     SegmentInMemory res;
     if(!aggregated_.empty()) {
         aggregated_.resize(unique_values);
-        auto pos = res.add_column(scalar_field(DataType::UINT64, output_column_name.value), unique_values, AllocationType::PRESIZED);
-        auto& column = res.column(pos);
-        auto ptr = reinterpret_cast<uint64_t*>(column.ptr());
-        column.set_row_data(unique_values - 1);
-        memcpy(ptr, aggregated_.data(), sizeof(uint64_t)*unique_values);
+        sparse_map_.resize(unique_values);
+        auto col = create_output_column(make_scalar_type(get_output_data_type()), std::move(sparse_map_), unique_values);
+        if (!col->opt_sparse_map().has_value()) {
+            // If all values are set we use memcpy for efficiency
+            auto ptr = reinterpret_cast<uint64_t*>(col->ptr());
+            memcpy(ptr, aggregated_.data(), sizeof(uint64_t)*unique_values);
+        } else {
+            using OutputTypeDescriptor = typename ScalarTypeInfo<DataTypeTag<DataType::UINT64>>::TDT;
+            Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
+                row.value() = aggregated_[row.idx()];
+            });
+        }
+        res.add_column(scalar_field(get_output_data_type(), output_column_name.value), std::move(col));
     }
     return res;
 }
@@ -538,6 +550,7 @@ void FirstAggregatorData::aggregate(const ColumnWithStrings& input_column, const
             using GlobalTypeDescriptorTag =  typename OutputType<GlobalInputType>::type;
             using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
             aggregated_.resize(sizeof(GlobalRawType)* unique_values);
+            sparse_map_.resize(unique_values);
             auto col_data = input_column.column_->data();
             auto out_ptr = reinterpret_cast<GlobalRawType*>(aggregated_.data());
             details::visit_type(input_column.column_->type().data_type(), [this, &groups, &out_ptr, &col_data] (auto col_tag) {
@@ -553,11 +566,13 @@ void FirstAggregatorData::aggregate(const ColumnWithStrings& input_column, const
                             if (is_first_group_el || std::isnan(static_cast<ColumnType>(val))) {
                                 groups_cache_.insert(groups[groups_pos]);
                                 val = GlobalRawType(*ptr);
+                                sparse_map_.set(groups[groups_pos]);
                             }
                         } else {
                             if (is_first_group_el) {
                                 groups_cache_.insert(groups[groups_pos]);
                                 val = GlobalRawType(*ptr);
+                                sparse_map_.set(groups[groups_pos]);
                             }
                         }
                     }
@@ -571,12 +586,20 @@ SegmentInMemory FirstAggregatorData::finalize(const ColumnName& output_column_na
     SegmentInMemory res;
     if(!aggregated_.empty()) {
         details::visit_type(*data_type_, [this, &res, &output_column_name, unique_values] (auto col_tag) {
-            using RawType = typename decltype(col_tag)::DataTypeTag::raw_type;
+            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
+            using RawType = typename col_type_info::RawType;
             aggregated_.resize(sizeof(RawType)* unique_values);
-            auto col = std::make_shared<Column>(make_scalar_type(data_type_.value()), unique_values, AllocationType::PRESIZED, Sparsity::NOT_PERMITTED);
-            memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            sparse_map_.resize(unique_values);
+            auto col = create_output_column(make_scalar_type(data_type_.value()), std::move(sparse_map_), unique_values);
+            if (!col->opt_sparse_map().has_value()) {
+                memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            } else {
+                const std::span<const RawType> group_values{reinterpret_cast<const RawType*>(aggregated_.data()), aggregated_.size() / sizeof(RawType)};
+                Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
+                    row.value() = group_values[row.idx()];
+                });
+            }
             res.add_column(scalar_field(data_type_.value(), output_column_name.value), col);
-            col->set_row_data(unique_values - 1);
         });
     }
     return res;
@@ -601,6 +624,7 @@ void LastAggregatorData::aggregate(const ColumnWithStrings& input_column, const
             using GlobalTypeDescriptorTag =  typename OutputType<GlobalInputType>::type;
             using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
             aggregated_.resize(sizeof(GlobalRawType)* unique_values);
+            sparse_map_.resize(unique_values);
             auto col_data = input_column.column_->data();
             auto out_ptr = reinterpret_cast<GlobalRawType*>(aggregated_.data());
             details::visit_type(input_column.column_->type().data_type(), [&groups, &out_ptr, &col_data, this] (auto col_tag) {
@@ -617,9 +641,11 @@ void LastAggregatorData::aggregate(const ColumnWithStrings& input_column, const
                             if (is_first_group_el || !std::isnan(static_cast<ColumnType>(curr))) {
                                 groups_cache_.insert(groups[groups_pos]);
                                 val = curr;
+                                sparse_map_.set(groups[groups_pos]);
                             }
                         } else {
                             val = GlobalRawType(*ptr);
+                            sparse_map_.set(groups[groups_pos]);
                         }
                     }
                 }
@@ -631,13 +657,21 @@ void LastAggregatorData::aggregate(const ColumnWithStrings& input_column, const
 SegmentInMemory LastAggregatorData::finalize(const ColumnName& output_column_name, bool, size_t unique_values) {
     SegmentInMemory res;
     if(!aggregated_.empty()) {
-        details::visit_type(*data_type_, [that=this, &res, &output_column_name, unique_values] (auto col_tag) {
-            using RawType = typename decltype(col_tag)::DataTypeTag::raw_type;
-            that->aggregated_.resize(sizeof(RawType)* unique_values);
-            auto col = std::make_shared<Column>(make_scalar_type(that->data_type_.value()), unique_values, AllocationType::PRESIZED, Sparsity::NOT_PERMITTED);
-            memcpy(col->ptr(), that->aggregated_.data(), that->aggregated_.size());
-            res.add_column(scalar_field(that->data_type_.value(), output_column_name.value), col);
-            col->set_row_data(unique_values - 1);
+        details::visit_type(*data_type_, [&res, &output_column_name, unique_values, this] (auto col_tag) {
+            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
+            using RawType = typename col_type_info::RawType;
+            aggregated_.resize(sizeof(RawType)* unique_values);
+            sparse_map_.resize(unique_values);
+            auto col = create_output_column(make_scalar_type(data_type_.value()), std::move(sparse_map_), unique_values);
+            if (!col->opt_sparse_map().has_value()) {
+                memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            } else {
+                const std::span<const RawType> group_values{reinterpret_cast<const RawType*>(aggregated_.data()), aggregated_.size() / sizeof(RawType)};
+                Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
+                    row.value() = group_values[row.idx()];
+                });
+            }
+            res.add_column(scalar_field(data_type_.value(), output_column_name.value), col);
         });
     }
     return res;
diff --git a/cpp/arcticdb/processing/unsorted_aggregation.hpp b/cpp/arcticdb/processing/unsorted_aggregation.hpp
@@ -132,6 +132,7 @@ class MeanAggregatorData : private AggregatorDataBase
     };
     std::vector<Fraction> fractions_;
     std::optional<DataType> data_type_;
+    util::BitMagic sparse_map_;
 };
 
 class CountAggregatorData : private AggregatorDataBase
@@ -149,6 +150,7 @@ class CountAggregatorData : private AggregatorDataBase
 private:
 
     std::vector<uint64_t> aggregated_;
+    util::BitMagic sparse_map_;
 };
 
 class FirstAggregatorData : private AggregatorDataBase
@@ -168,6 +170,7 @@ class FirstAggregatorData : private AggregatorDataBase
     std::optional<DataType> data_type_;
 
     std::unordered_set<size_t> groups_cache_;
+    util::BitMagic sparse_map_;
 };
 
 class LastAggregatorData : private AggregatorDataBase
@@ -187,6 +190,7 @@ class LastAggregatorData : private AggregatorDataBase
     std::optional<DataType> data_type_;
 
     std::unordered_set<size_t> groups_cache_;
+    util::BitMagic sparse_map_;
 };
 
 template <class AggregatorData>
diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow.py b/python/tests/unit/arcticdb/version_store/test_arrow.py