
Commit 803c91e

[10026766759] Correct sparse handling for Aggregation clauses (#2644)
#### Reference Issues/PRs

Monday ref: 10026766759

#### What does this implement or fix?

- Makes Aggregation clauses like `Mean` and `Count` respect input column sparsity
- Fixes `CopyToBufferTask` to respect sparsity for arrow
- Adds a similar test for resampling
- Adds an xfail test for monday issue: 10029194063

#### Any other comments?

Commits can be reviewed individually

#### Checklist

<details>
<summary>Checklist for code changes...</summary>

- [ ] Have you updated the relevant docstrings, documentation and copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes?
</details>
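For orientation, the user-visible behaviour this targets can be sketched as follows. This is a minimal sketch in the style of the tests below, not part of the change: `lib` stands in for a dynamic-schema library handle such as the `lmdb_version_store_dynamic_schema_v1` fixture with Arrow output enabled, and the symbol/column names are illustrative.

```python
import pandas as pd
from arcticdb import QueryBuilder

# First slice has the column, second does not, so under dynamic schema
# "vals" is entirely missing for the second slice's rows.
lib.append("sym", pd.DataFrame({"group": [0, 1], "vals": [1.0, 2.0]}), write_if_missing=True)
lib.append("sym", pd.DataFrame({"group": [2, 2]}))

q = QueryBuilder()
q = q.groupby("group").agg({"vals": "mean"})
table = lib.read("sym", query_builder=q).data

# Before this change, group 2's mean could come back default-initialized in
# Arrow output; with the fix its validity bit is unset, i.e. the value is null.
```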
1 parent 7acd347 commit 803c91e

File tree

8 files changed: +270 additions, -121 deletions


cpp/arcticdb/arrow/array_from_block.hpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -19,6 +19,11 @@ inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(
 ) {
     if (column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
         auto& bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
+        util::check(
+                bitmap_buffer.blocks().size() == 1,
+                "Expected a single block bitmap extra buffer but got {} blocks",
+                bitmap_buffer.blocks().size()
+        );
         return sparrow::validity_bitmap{reinterpret_cast<uint8_t*>(bitmap_buffer.block(0)->release()), bitmap_size};
     } else {
         return std::nullopt;
```
cpp/arcticdb/processing/test/test_clause.cpp

Lines changed: 13 additions & 13 deletions
```diff
@@ -112,12 +112,7 @@ void check_column(arcticdb::SegmentInMemory segment, std::string_view column_name
     ASSERT_EQ(dt, column.type().data_type());
     for (std::size_t idx = 0u; idx < ugv; ++idx) {
         if constexpr (std::is_floating_point_v<T>) {
-            const T val = column.scalar_at<T>(idx).value();
-            if (std::isnan(val)) {
-                ASSERT_TRUE(std::isnan(f(idx)));
-            } else {
-                ASSERT_EQ(f(idx), val);
-            }
+            ASSERT_EQ(f(idx), column.scalar_at<T>(idx));
         } else {
             ASSERT_EQ(f(idx), column.scalar_at<T>(idx));
         }
@@ -192,17 +187,22 @@ TEST(Clause, AggregationSparseColumn) {
         return idx % 2 == 0 ? 450 + 10 * idx : 0;
     });
     check_column<int64_t>(*segments[0], "min_int", unique_grouping_values, [](size_t idx) -> std::optional<int64_t> {
-        return idx % 2 == 0 ? std::optional{static_cast<int64_t>(idx)} : std::nullopt;
+        return idx % 2 == 0 ? std::make_optional<int64_t>(idx) : std::nullopt;
     });
     check_column<int64_t>(*segments[0], "max_int", unique_grouping_values, [](size_t idx) -> std::optional<int64_t> {
-        return idx % 2 == 0 ? std::optional{static_cast<int64_t>(90 + idx)} : std::nullopt;
-    });
-    check_column<double>(*segments[0], "mean_int", unique_grouping_values, [](size_t idx) -> double {
-        return idx % 2 == 0 ? 45 + idx : std::numeric_limits<double>::quiet_NaN();
+        return idx % 2 == 0 ? std::make_optional<int64_t>(90 + idx) : std::nullopt;
     });
-    check_column<uint64_t>(*segments[0], "count_int", unique_grouping_values, [](size_t idx) -> uint64_t {
-        return idx % 2 == 0 ? 10 : 0;
+    check_column<double>(*segments[0], "mean_int", unique_grouping_values, [](size_t idx) -> std::optional<double> {
+        return idx % 2 == 0 ? std::make_optional<double>(45 + idx) : std::nullopt;
     });
+    check_column<uint64_t>(
+            *segments[0],
+            "count_int",
+            unique_grouping_values,
+            [](size_t idx) -> std::optional<uint64_t> {
+                return idx % 2 == 0 ? std::make_optional<uint64_t>(10) : std::nullopt;
+            }
+    );
 }
 
 TEST(Clause, AggregationSparseGroupby) {
```
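The reworked expectations return `std::optional` throughout, so a group that received no input values is `std::nullopt` rather than a sentinel `NaN` (for `mean_int`) or `0` (for `count_int`). The same expectations restated as a Python sketch, with `None` playing the role of `std::nullopt`:

```python
# Expected aggregates in AggregationSparseColumn: only even group indices
# received any input values for the sparse "int" column.
def expected_mean_int(idx: int):
    return 45 + idx if idx % 2 == 0 else None  # previously NaN marked "missing"

def expected_count_int(idx: int):
    return 10 if idx % 2 == 0 else None  # previously 0 marked "missing"

assert expected_mean_int(2) == 47 and expected_mean_int(3) is None
```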

cpp/arcticdb/processing/test/test_unsorted_aggregation.cpp

Lines changed: 6 additions & 2 deletions
```diff
@@ -87,7 +87,7 @@ class AggregationResult : public ::testing::TestWithParam<DataType> {
         if constexpr (is_bool_type(InputTypeTag::data_type())) {
             return std::array{2 / 3.0, 0.0, 1.0, 1 / 3.0};
         } else if constexpr (is_empty_type(InputTypeTag::data_type())) {
-            return std::array{0.0, 0.0, 0.0};
+            return std::array<double, 0>{};
         }
     }
 
@@ -148,7 +148,11 @@ TEST_P(AggregationResult, Mean) {
     ASSERT_EQ(result.field(0).type(), make_scalar_type(OutputDataTypeTag::data_type()));
     ASSERT_EQ(result.field(0).name(), "output");
     const Column& aggregated_column = result.column(0);
-    ASSERT_EQ(aggregated_column.row_count(), group_count);
+    if constexpr (!is_empty_type(TypeTag::data_type)) {
+        ASSERT_EQ(aggregated_column.row_count(), group_count);
+    } else {
+        ASSERT_EQ(aggregated_column.row_count(), 0);
+    }
     constexpr static std::array expected = get_expected_result_mean<InputDataTypeTag>();
     Column::for_each_enumerated<OutputDataTypeTag>(aggregated_column, [&](const auto& row) {
         ASSERT_EQ(row.value(), expected[row.idx()]);
```
cpp/arcticdb/processing/unsorted_aggregation.cpp

Lines changed: 124 additions & 88 deletions
Large diffs are not rendered by default.

cpp/arcticdb/processing/unsorted_aggregation.hpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -118,6 +118,7 @@ class MeanAggregatorData : private AggregatorDataBase {
     };
     std::vector<Fraction> fractions_;
     std::optional<DataType> data_type_;
+    util::BitMagic sparse_map_;
 };
 
 class CountAggregatorData : private AggregatorDataBase {
@@ -131,6 +132,7 @@ class CountAggregatorData : private AggregatorDataBase {
 
 private:
     std::vector<uint64_t> aggregated_;
+    util::BitMagic sparse_map_;
 };
 
 class FirstAggregatorData : private AggregatorDataBase {
@@ -146,6 +148,7 @@ class FirstAggregatorData : private AggregatorDataBase {
     std::optional<DataType> data_type_;
 
     std::unordered_set<size_t> groups_cache_;
+    util::BitMagic sparse_map_;
 };
 
 class LastAggregatorData : private AggregatorDataBase {
@@ -161,6 +164,7 @@ class LastAggregatorData : private AggregatorDataBase {
    std::optional<DataType> data_type_;
 
    std::unordered_set<size_t> groups_cache_;
+   util::BitMagic sparse_map_;
 };
 
 template<class AggregatorData>
```
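Each of these aggregators now records which groups actually received at least one input value in a `util::BitMagic` bitmap (a compressed bitvector), so finalization can emit a sparse output column. A toy Python model of the bookkeeping, with a plain list standing in for the bitmap:

```python
# Toy model: one accumulator slot per group, plus a presence bitmap.
num_groups = 4
aggregated = [0] * num_groups
sparse_map = [False] * num_groups  # stands in for util::BitMagic

def aggregate(group_id: int, value: int) -> None:
    aggregated[group_id] += value
    sparse_map[group_id] = True  # this group has real data

for group_id, value in [(0, 10), (2, 5), (2, 7)]:
    aggregate(group_id, value)

# On finalize, groups never marked present (1 and 3 here) become nulls in the
# output instead of reporting a default-initialized 0.
assert sparse_map == [True, False, True, False]
```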

cpp/arcticdb/version/version_core.cpp

Lines changed: 48 additions & 11 deletions
```diff
@@ -1470,6 +1470,25 @@ static void check_incompletes_index_ranges_dont_overlap(
     }
 }
 
+void init_sparse_dst_column_before_copy(
+        Column& dst_column, size_t offset, size_t num_rows, size_t dst_rawtype_size, OutputFormat output_format,
+        const std::optional<util::BitSet>& src_sparse_map, const std::optional<Value>& default_value
+) {
+    if (output_format != OutputFormat::ARROW || default_value.has_value()) {
+        auto total_size = dst_rawtype_size * num_rows;
+        auto dst_ptr = dst_column.bytes_at(offset, total_size);
+        dst_column.type().visit_tag([&](auto dst_desc_tag) {
+            util::initialize<decltype(dst_desc_tag)>(dst_ptr, total_size, default_value);
+        });
+    } else {
+        if (src_sparse_map.has_value()) {
+            create_dense_bitmap(offset, src_sparse_map.value(), dst_column, AllocationType::DETACHABLE);
+        } else {
+            create_dense_bitmap_all_zeros(offset, num_rows, dst_column, AllocationType::DETACHABLE);
+        }
+    }
+}
+
 void copy_frame_data_to_buffer(
         SegmentInMemory& destination, size_t target_index, SegmentInMemory& source, size_t source_index,
         const RowRange& row_range, DecodePathData shared_data, std::any& handler_data, OutputFormat output_format,
@@ -1510,10 +1529,9 @@
         };
         handler->convert_type(src_column, dst_column, mapping, shared_data, handler_data, source.string_pool_ptr());
     } else if (is_empty_type(src_column.type().data_type())) {
-        // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
-        dst_column.type().visit_tag([&](auto dst_desc_tag) {
-            util::initialize<decltype(dst_desc_tag)>(dst_ptr, total_size, default_value);
-        });
+        init_sparse_dst_column_before_copy(
+                dst_column, offset, num_rows, dst_rawtype_size, output_format, std::nullopt, default_value
+        );
         // Do not use src_column.is_sparse() here, as that misses columns that are dense, but have fewer than num_rows
         // values
     } else if (src_column.opt_sparse_map().has_value() &&
@@ -1524,8 +1542,15 @@
             using dst_type_info = ScalarTypeInfo<decltype(dst_tag)>;
             typename dst_type_info::RawType* typed_dst_ptr =
                     reinterpret_cast<typename dst_type_info::RawType*>(dst_ptr);
-            // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
-            util::initialize<typename dst_type_info::TDT>(dst_ptr, num_rows * dst_rawtype_size, default_value);
+            init_sparse_dst_column_before_copy(
+                    dst_column,
+                    offset,
+                    num_rows,
+                    dst_rawtype_size,
+                    output_format,
+                    src_column.opt_sparse_map(),
+                    default_value
+            );
             details::visit_type(src_column.type().data_type(), [&](auto src_tag) {
                 using src_type_info = ScalarTypeInfo<decltype(src_tag)>;
                 Column::for_each_enumerated<typename src_type_info::TDT>(
@@ -1548,8 +1573,15 @@
                 dst_ptr += row_count * sizeof(SourceType);
             }
         } else {
-            // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
-            util::initialize<SourceTDT>(dst_ptr, num_rows * dst_rawtype_size, default_value);
+            init_sparse_dst_column_before_copy(
+                    dst_column,
+                    offset,
+                    num_rows,
+                    dst_rawtype_size,
+                    output_format,
+                    src_column.opt_sparse_map(),
+                    default_value
+            );
             SourceType* typed_dst_ptr = reinterpret_cast<SourceType*>(dst_ptr);
             Column::for_each_enumerated<SourceTDT>(src_column, [&](const auto& row) {
                 typed_dst_ptr[row.idx()] = row.value();
@@ -1580,16 +1612,21 @@
         // one with float32 dtype and one with dtype:
         // common_type(common_type(uint16, int8), float32) = common_type(int32, float32) = float64
         details::visit_type(dst_column.type().data_type(), [&](auto dest_desc_tag) {
-            using dst_type_info = ScalarTypeInfo<decltype(dest_desc_tag)>;
             using DestinationRawType = typename decltype(dest_desc_tag)::DataTypeTag::raw_type;
             auto typed_dst_ptr = reinterpret_cast<DestinationRawType*>(dst_ptr);
             details::visit_type(src_column.type().data_type(), [&](auto src_desc_tag) {
                 using source_type_info = ScalarTypeInfo<decltype(src_desc_tag)>;
                 if constexpr (std::is_arithmetic_v<typename source_type_info::RawType> &&
                               std::is_arithmetic_v<DestinationRawType>) {
                     if (src_column.is_sparse()) {
-                        util::initialize<typename dst_type_info::TDT>(
-                                dst_ptr, num_rows * dst_rawtype_size, default_value
+                        init_sparse_dst_column_before_copy(
+                                dst_column,
+                                offset,
+                                num_rows,
+                                dst_rawtype_size,
+                                output_format,
+                                src_column.opt_sparse_map(),
+                                default_value
                         );
                         Column::for_each_enumerated<typename source_type_info::TDT>(src_column, [&](const auto& row) {
                             typed_dst_ptr[row.idx()] = row.value();
```
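`init_sparse_dst_column_before_copy` centralizes the choice the removed TODOs pointed at: for non-Arrow output, or whenever an explicit `default_value` is supplied, the destination rows are still value-initialized; for Arrow output a dense validity bitmap is attached instead, copied from the source sparse map when there is one and all-zeros when the slice is entirely missing. A toy Python model of that branch (names mirror the C++; this is not the actual implementation):

```python
from typing import Optional, Sequence

def init_dst_model(output_format: str, default_value, src_sparse_map: Optional[Sequence[bool]], num_rows: int):
    """Return (values, validity) the way the destination column would be set up."""
    if output_format != "ARROW" or default_value is not None:
        # Pandas-style output or an explicit fill: initialize every row with
        # the default so missing values read back as NaN/0/default_value.
        return [default_value] * num_rows, None
    if src_sparse_map is not None:
        # Arrow output: reuse the source's presence bits as the validity bitmap.
        return [None] * num_rows, list(src_sparse_map)
    # Entirely missing slice: all-zeros bitmap, i.e. every row is null.
    return [None] * num_rows, [False] * num_rows

# Sparse source with rows 0 and 2 present, Arrow output, no fill value:
assert init_dst_model("ARROW", None, [True, False, True], 3)[1] == [True, False, True]
```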

python/tests/unit/arcticdb/version_store/test_append.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -750,3 +750,15 @@ def test_append_series_with_different_row_range_index_name(lmdb_version_store_dy
     # See Monday 9797097831, it would be best to require that index names are always matching. This is the case for
     # datetime index because it's a physical column. It's a potentially breaking change.
     assert lib.read("sym").data.index.name == "index_name_2"
+
+
+@pytest.mark.xfail(reason="Wrong normalization metadata update. Monday ref: 10029194063")
+def test_append_no_columns(lmdb_version_store_dynamic_schema_v1):
+    lib = lmdb_version_store_dynamic_schema_v1
+    to_write = pd.DataFrame({"col": [1, 2, 3]}, index=pd.date_range(pd.Timestamp(2025, 1, 1), periods=3))
+    to_append = pd.DataFrame({}, index=pd.date_range(pd.Timestamp(2025, 1, 4), periods=3))
+    lib.write("sym", to_write)
+    lib.append("sym", to_append)
+    expected = pd.concat([to_write, to_append])
+    result = lib.read("sym").data
+    assert_frame_equal(result, expected)
```

python/tests/unit/arcticdb/version_store/test_arrow.py

Lines changed: 58 additions & 7 deletions
```diff
@@ -757,12 +757,63 @@ def test_aggregation_empty_slices(lmdb_version_store_dynamic_schema_v1):
     table = lib.read(sym, query_builder=q).data
     # sum_col is correctly filled with 0s instead of nulls
     assert pc.count(table.column("sum_col"), mode="only_null").as_py() == 0
-    # TODO: Fix the TODOs in `CopyToBufferTask` to make num_nulls=5 as expected
-    # For this test it so happens that one present and one missing value end up in the same bucket.
-    # Copying then default initializes the missing values instead of setting the validity bitmap.
-    # assert pc.count(table.column("mean_col"), mode="only_null").as_py() == 5
-    # assert pc.count(table.column("min_col"), mode="only_null").as_py() == 5
-    # assert pc.count(table.column("max_col"), mode="only_null").as_py() == 5
-    # assert pc.count(table.column("count_col"), mode="only_null").as_py() == 5
+    assert pc.count(table.column("mean_col"), mode="only_null").as_py() == 5
+    assert pc.count(table.column("min_col"), mode="only_null").as_py() == 5
+    assert pc.count(table.column("max_col"), mode="only_null").as_py() == 5
+    assert pc.count(table.column("count_col"), mode="only_null").as_py() == 5
+    expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
+    assert_frame_equal_with_arrow(table, expected)
+
+
+def test_resample_empty_slices(lmdb_version_store_dynamic_schema_v1):
+    lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+
+    def gen_df(start, num_rows, with_columns=True):
+        data = {}
+        if with_columns:
+            data = {
+                "mean_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "sum_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "min_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "max_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "count_col": np.arange(start, start + num_rows, dtype=np.float64),
+            }
+        index = pd.date_range(pd.Timestamp(2025, 1, start), periods=num_rows)
+        return pd.DataFrame(data, index=index)
+
+    slices = [
+        gen_df(1, 3),
+        gen_df(4, 2, False),  # We expect an entirely missing slice 4th-5th
+        gen_df(6, 3),
+        gen_df(9, 5, False),  # We expect two missing slices 10th-11th and 12th-13th
+        gen_df(14, 2),
+        gen_df(16, 2, False),  # We expect one missing slice 16th-17th
+        # TODO: If we don't finish with an append with columns our normalization metadata will be broken
+        gen_df(18, 1),
+    ]
+    for df_slice in slices:
+        lib.append(sym, df_slice, write_if_missing=True)
+
+    q = QueryBuilder()
+    q.resample("2d").agg(
+        {
+            "mean_col": "mean",
+            "sum_col": "sum",
+            "min_col": "min",
+            "max_col": "max",
+            "count_col": "count",
+        }
+    )
+
+    table = lib.read(sym, query_builder=q).data
+    # sum_col is correctly filled with 0s instead of nulls
+    assert pc.count(table.column("sum_col"), mode="only_null").as_py() == 0
+    # We expect 4 entirely empty buckets
+    assert pc.count(table.column("mean_col"), mode="only_null").as_py() == 4
+    assert pc.count(table.column("min_col"), mode="only_null").as_py() == 4
+    assert pc.count(table.column("max_col"), mode="only_null").as_py() == 4
+    assert pc.count(table.column("count_col"), mode="only_null").as_py() == 4
     expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
     assert_frame_equal_with_arrow(table, expected)
```
