
Commit ffe58bc

[9898177828] Use default_value in NullReducer for OutputFormat::ARROW (#2633)
#### Reference Issues/PRs

Monday ref: 9898177828

#### What does this implement or fix?

When doing aggregation we explicitly default `sum=0` for slices with no underlying values. For Arrow output this means not setting the validity bitmap in that case and default-initializing the values instead. The change includes:

- A small refactor of `NullReducer` to extract the parts common to `reduce` and `finalize` into `backfill_up_to_frame_offset`
- A modification of `Column::default_initialize` to work across several blocks
- Removal of the broken `memset` method from `ChunkedBuffer`, replaced by a new `util::initialize` method which can initialize a `ChunkedBuffer` across blocks

#### Checklist

<details>
<summary>Checklist for code changes...</summary>

- [ ] Have you updated the relevant docstrings, documentation and copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes?
</details>
1 parent a10ecd7 commit ffe58bc
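The fix hinges on Arrow's null convention: a slot is null exactly when its validity bit is clear, and the values buffer underneath a cleared bit may hold anything. A `sum` over an empty slice must instead surface a real `0`, so the commit writes the default value into the buffer and skips the all-zero validity bitmap. Below is a conceptual sketch of that convention only (plain C++, not ArcticDB or Arrow library code; all names are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Conceptual sketch of the Arrow null convention (illustrative, not ArcticDB code):
// a slot is null iff its validity bit is 0, regardless of the values buffer.
int main() {
    std::vector<double> values(4, 0.0);  // default-initialized values buffer
    uint8_t validity = 0b0101;           // slots 0 and 2 valid; slots 1 and 3 null

    for (int i = 0; i < 4; ++i) {
        if ((validity >> i) & 1)
            std::printf("slot %d: %g\n", i, values[i]);
        else
            std::printf("slot %d: null\n", i);  // buffer value underneath is ignored
    }
    // For `sum` over an empty slice the commit instead leaves the values at the
    // default 0 and keeps every validity bit set, so slots read back as real 0s.
}
```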

File tree

5 files changed: +84 −81 lines

- cpp/arcticdb/column_store/chunked_buffer.hpp
- cpp/arcticdb/column_store/column.cpp
- cpp/arcticdb/pipeline/read_frame.cpp
- cpp/arcticdb/util/sparse_utils.hpp
- python/tests/unit/arcticdb/version_store/test_arrow.py


cpp/arcticdb/column_store/chunked_buffer.hpp

Lines changed: 15 additions & 15 deletions
```diff
@@ -311,7 +311,7 @@ class ChunkedBufferImpl {

     uint8_t* bytes_at(size_t pos_bytes, size_t required) {
         auto [block, pos, _] = block_and_offset(pos_bytes);
-        util::check(pos + required <= block->bytes(), "Block overflow, position {} is greater than block capacity {}", pos, block->bytes());
+        util::check(pos + required <= block->bytes(), "Block overflow, position {} is greater than block capacity {}", pos + required, block->bytes());
         return &(*block)[pos];
     }

@@ -366,21 +366,21 @@
         }
     }

-    void memset_buffer(size_t offset, size_t bytes, char value) {
-        auto [block, pos, block_index] = block_and_offset(offset);
-        while(bytes > 0) {
-            const auto size_to_write = block->bytes() - pos;
-            memset(block->data() + pos, size_to_write, value);
-            bytes -= size_to_write;
-            if(bytes > 0) {
-                ++block_index;
-                if(block_index == blocks_.size())
-                    return;
-
-                block = blocks_[block_index];
-                pos = 0;
-            }
+    // Returns a vector of continuous buffers, each designated by a pointer and size
+    // Similar to `bytes_at` but will work if the requested range spans multiple continuous blocks.
+    std::vector<std::pair<uint8_t*, size_t>> byte_blocks_at(size_t pos_bytes, size_t required_bytes) {
+        check_bytes(pos_bytes, required_bytes);
+        std::vector<std::pair<uint8_t*, size_t>> result;
+        auto [block, pos, block_index] = block_and_offset(pos_bytes);
+        while(required_bytes > 0) {
+            block = blocks_[block_index];
+            const auto size_to_write = std::min(required_bytes, block->bytes() - pos);
+            result.push_back({block->data() + pos, size_to_write});
+            required_bytes -= size_to_write;
+            ++block_index;
+            pos = 0;
         }
+        return result;
     }

     template<typename T>
```
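The heart of `byte_blocks_at` is the walk from the block containing `pos_bytes` forward, clamping each contiguous span with `std::min`. Here is a minimal, self-contained sketch of the same traversal; `SimpleChunkedBuffer` is a hypothetical stand-in for `ChunkedBufferImpl`, with the block lookup inlined in place of `block_and_offset`:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

// Hypothetical stand-in for ChunkedBufferImpl: a sequence of variable-sized blocks.
struct SimpleChunkedBuffer {
    std::vector<std::vector<uint8_t>> blocks;

    // Split [pos_bytes, pos_bytes + required_bytes) into per-block contiguous
    // spans, mirroring the new byte_blocks_at.
    std::vector<std::pair<uint8_t*, size_t>> byte_blocks_at(size_t pos_bytes, size_t required_bytes) {
        std::vector<std::pair<uint8_t*, size_t>> result;
        size_t block_index = 0;
        size_t pos = pos_bytes;
        while (pos >= blocks[block_index].size())  // locate the block containing pos_bytes
            pos -= blocks[block_index++].size();
        while (required_bytes > 0) {
            auto& block = blocks[block_index];
            const auto size_to_write = std::min(required_bytes, block.size() - pos);
            result.push_back({block.data() + pos, size_to_write});
            required_bytes -= size_to_write;
            ++block_index;
            pos = 0;  // subsequent blocks are consumed from their start
        }
        return result;
    }
};

int main() {
    SimpleChunkedBuffer buffer{{std::vector<uint8_t>(8), std::vector<uint8_t>(8)}};
    // Zero a 10-byte range starting at offset 4: it spans both blocks.
    for (auto [data, size] : buffer.byte_blocks_at(4, 10))
        std::memset(data, 0, size);
}
```

Returning (pointer, size) pairs rather than a single pointer is what lets callers handle ranges that cross block boundaries, which is exactly where the removed `memset_buffer` went wrong: it never clamped `size_to_write` to the requested byte count, and it also passed `memset` its value and size arguments in the wrong order.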

cpp/arcticdb/column_store/column.cpp

Lines changed: 1 addition & 3 deletions
```diff
@@ -670,9 +670,7 @@ void Column::default_initialize_rows(size_t start_pos, size_t num_rows, bool ens
     if (ensure_alloc) {
         data_.ensure<uint8_t>(bytes);
     }
-    // This doesn't work if we default_initialize bytes which span across multiple blocks.
-    auto type_ptr = reinterpret_cast<RawType *>(data_.bytes_at(start_pos * sizeof(RawType), bytes));
-    util::initialize<T>(reinterpret_cast<uint8_t*>(type_ptr), bytes, default_value);
+    util::initialize<T>(data_.buffer(), start_pos * sizeof(RawType), bytes, default_value);
     if (ensure_alloc) {
         data_.commit();
     }
```

cpp/arcticdb/pipeline/read_frame.cpp

Lines changed: 18 additions & 30 deletions
```diff
@@ -763,38 +763,38 @@ class NullValueReducer {
         return context_row.slice_and_key().slice_.row_range.first;
     }

-    void backfill_all_zero_validity_bitmaps_up_to(std::optional<size_t> up_to_block_offset) {
+    void backfill_all_zero_validity_bitmaps_up_to(size_t up_to_block_offset) {
         // Fills up all validity bitmaps with zeros from `column_block_idx_` until reaching `up_to_block_offset`.
-        // If `up_to_block_offset` is `std::nullopt` then fills up until the end of the column.
         const auto& block_offsets = column_.block_offsets();
-        util::check(!up_to_block_offset.has_value() || up_to_block_offset.value() <= block_offsets.back(), "up_to_block_offset outside of range");
-        for (; column_block_idx_ < block_offsets.size() - 1; ++column_block_idx_) {
-            if (up_to_block_offset.has_value() && block_offsets.at(column_block_idx_) >= up_to_block_offset.value()) {
-                break;
-            }
+        util::check(up_to_block_offset <= block_offsets.back(), "up_to_block_offset {} outside of range {}", up_to_block_offset, block_offsets.back());
+        for (; column_block_idx_ < block_offsets.size() - 1 && block_offsets.at(column_block_idx_) < up_to_block_offset; ++column_block_idx_) {
             auto rows = (block_offsets.at(column_block_idx_ + 1) - block_offsets.at(column_block_idx_)) / type_bytes_;
             create_dense_bitmap_all_zeros(block_offsets.at(column_block_idx_), rows, column_, AllocationType::DETACHABLE);
         }
     }

-    void reduce(PipelineContextRow &context_row){
-        auto &slice_and_key = context_row.slice_and_key();
-        auto sz_to_advance = slice_and_key.slice_.row_range.diff();
-        auto current_pos = context_row.slice_and_key().slice_.row_range.first;
-        if (current_pos != pos_) {
-            const auto num_rows = current_pos - pos_;
+    void backfill_up_to_frame_offset(size_t up_to) {
+        if (pos_ != up_to) {
+            const auto num_rows = up_to - pos_;
             const auto start_row = pos_ - frame_.offset();
-            const auto end_row = current_pos - frame_.offset();
+            const auto end_row = up_to - frame_.offset();
             if (const std::shared_ptr<TypeHandler>& handler = get_type_handler(output_format_, column_.type()); handler) {
                 handler->default_initialize(column_.buffer(), start_row * handler->type_size(), num_rows * handler->type_size(), shared_data_, handler_data_);
-            } else if (output_format_ != OutputFormat::ARROW) {
+            } else if (output_format_ != OutputFormat::ARROW || default_value_.has_value()) {
                 // Arrow does not care what values are in the main buffer where the validity bitmap is zero
                 column_.default_initialize_rows(start_row, num_rows, false, default_value_);
             }
-            if (output_format_ == OutputFormat::ARROW) {
+            if (output_format_ == OutputFormat::ARROW && !default_value_.has_value()) {
                 backfill_all_zero_validity_bitmaps_up_to(end_row * type_bytes_);
             }
         }
+    }
+
+    void reduce(PipelineContextRow &context_row){
+        auto &slice_and_key = context_row.slice_and_key();
+        auto sz_to_advance = slice_and_key.slice_.row_range.diff();
+        auto current_pos = context_row.slice_and_key().slice_.row_range.first;
+        backfill_up_to_frame_offset(current_pos);
         pos_ = current_pos + sz_to_advance;
         if (output_format_ == OutputFormat::ARROW) {
             ++column_block_idx_;
@@ -804,20 +804,8 @@
     void finalize() {
         const auto total_rows = frame_.row_count();
         const auto end = frame_.offset() + total_rows;
-        if(pos_ != end) {
-            util::check(pos_ < end, "Overflow in finalize {} > {}", pos_, end);
-            const auto num_rows = end - pos_;
-            const auto start_row = pos_ - frame_.offset();
-            if (const std::shared_ptr<TypeHandler>& handler = get_type_handler(output_format_, column_.type()); handler) {
-                handler->default_initialize(column_.buffer(), start_row * handler->type_size(), num_rows * handler->type_size(), shared_data_, handler_data_);
-            } else if (output_format_ != OutputFormat::ARROW) {
-                // Arrow does not care what values are in the main buffer where the validity bitmap is zero
-                column_.default_initialize_rows(start_row, num_rows, false, default_value_);
-            }
-            if (output_format_ == OutputFormat::ARROW) {
-                backfill_all_zero_validity_bitmaps_up_to(std::nullopt);
-            }
-        }
+        util::check(pos_ <= end, "Overflow in finalize {} > {}", pos_, end);
+        backfill_up_to_frame_offset(end);
     }
 };
```
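The refactor is easier to follow in isolation: `reduce` backfills any gap in front of the incoming slice and then advances `pos_`, while `finalize` backfills whatever remains up to the end of the frame, both through the shared helper. A stripped-down, self-contained model of that control flow (`GapBackfiller` is a hypothetical simplification, not the real `NullValueReducer`):

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical, simplified model of NullValueReducer's shared backfill step.
// pos_ tracks the first row not yet covered by any slice; both reduce() and
// finalize() funnel through the same gap-filling helper.
struct GapBackfiller {
    size_t pos_ = 0;

    void backfill_up_to(size_t up_to) {
        if (pos_ != up_to) {
            // In ArcticDB this default-initializes rows [pos_, up_to) or, for
            // Arrow output without a default value, zeroes validity bitmaps.
            std::printf("backfill rows [%zu, %zu)\n", pos_, up_to);
        }
    }

    void reduce(size_t slice_first_row, size_t slice_row_count) {
        backfill_up_to(slice_first_row);  // fill any gap before this slice
        pos_ = slice_first_row + slice_row_count;
    }

    void finalize(size_t total_rows) {
        backfill_up_to(total_rows);       // fill any trailing gap
    }
};

int main() {
    GapBackfiller reducer;
    reducer.reduce(0, 5);   // slice covers rows [0, 5)
    reducer.reduce(10, 5);  // gap [5, 10) gets backfilled
    reducer.finalize(20);   // trailing gap [15, 20) gets backfilled
}
```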

cpp/arcticdb/util/sparse_utils.hpp

Lines changed: 9 additions & 33 deletions
```diff
@@ -86,39 +86,6 @@ void default_initialize(uint8_t* data, const size_t bytes) {
     }
 }

-template <typename TagType>
-requires util::instantiation_of<TagType, TypeDescriptorTag>
-void default_initialize(ChunkedBuffer& buffer, size_t offset, const size_t bytes, DecodePathData shared_data, std::any& handler_data) {
-    using RawType = typename TagType::DataTypeTag::raw_type;
-    const auto num_rows ARCTICDB_UNUSED = bytes / sizeof(RawType);
-    constexpr auto type = static_cast<TypeDescriptor>(TagType{});
-    constexpr auto data_type = type.data_type();
-    ColumnData column_data{&buffer, type};
-    auto pos = column_data.begin<TagType, IteratorType::REGULAR, IteratorDensity::DENSE, false>();
-    std::advance(pos, offset);
-    //auto end = column_data.begin<TagType, IteratorType::REGULAR, IteratorDensity::DENSE, false>();
-    if constexpr (is_sequence_type(data_type)) {
-        std::fill_n(pos, num_rows, not_a_string());
-    } else if constexpr (is_floating_point_type(data_type)) {
-        std::fill_n(pos, num_rows, std::numeric_limits<RawType>::quiet_NaN());
-    } else if constexpr (is_time_type(data_type)) {
-        std::fill_n(pos, num_rows, NaT);
-    } else if constexpr (is_integer_type(data_type) || is_bool_type(data_type)) {
-        buffer.memset_buffer(offset, bytes, 0);
-    } else {
-        constexpr auto type_descriptor = TagType::type_descriptor();
-        if (const std::shared_ptr<TypeHandler>& handler = arcticdb::TypeHandlerRegistry::instance()->get_handler(type_descriptor);handler) {
-            handler->default_initialize(buffer, offset, bytes, shared_data, handler_data);
-        } else {
-            internal::raise<ErrorCode::E_INVALID_ARGUMENT>(
-                "Default initialization for {} is not implemented.",
-                type_descriptor
-            );
-        }
-    }
-}
-
-
 /// Initialize a buffer either using a custom default value or using a predefined default value for the type
 /// @param[in] default_value Variant holding either a value of the raw type for the type tag or std::monostate
 template <typename TagType>
@@ -137,6 +104,15 @@ void initialize(uint8_t* data, const size_t bytes, const std::optional<Value>& d
     }
 }

+template <typename TagType>
+requires util::instantiation_of<TagType, TypeDescriptorTag>
+void initialize(ChunkedBuffer& buffer, size_t offset, size_t bytes, const std::optional<Value>& default_value) {
+    auto blocks = buffer.byte_blocks_at(offset, bytes);
+    for (auto [data, size] : blocks) {
+        initialize<TagType>(data, size, default_value);
+    }
+}
+
 [[nodiscard]] util::BitSet scan_object_type_to_sparse(
     const PyObject* const* ptr,
     size_t rows_to_write);
```
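The new overload simply fans the pointer-based `initialize<TagType>` out over the spans returned by `byte_blocks_at`. That pointer-based overload picks a type-appropriate default when no explicit `default_value` is supplied (NaN for floats, zero for integers, with ArcticDB's own sentinels for strings and timestamps). A simplified, self-contained illustration of the dispatch pattern (plain templates, not ArcticDB's type tags):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <type_traits>

// Simplified illustration of type-dispatched default initialization: floats get
// quiet NaN, integers get 0. ArcticDB additionally routes strings and timestamps
// to its own sentinel values (not_a_string(), NaT).
template <typename T>
void default_fill(T* data, std::size_t count) {
    if constexpr (std::is_floating_point_v<T>)
        std::fill_n(data, count, std::numeric_limits<T>::quiet_NaN());
    else
        std::fill_n(data, count, T{0});
}

int main() {
    double floats[4];
    std::int64_t ints[4];
    default_fill(floats, 4);  // all NaN
    default_fill(ints, 4);    // all 0
}
```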

python/tests/unit/arcticdb/version_store/test_arrow.py

Lines changed: 41 additions & 0 deletions
```diff
@@ -667,3 +667,44 @@ def test_project_dynamic_schema_complex(lmdb_version_store_dynamic_schema_v1):
     table = lib.read(sym, query_builder=q).data
     expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
     assert_frame_equal_with_arrow(table, expected)
+
+
+def test_aggregation_empty_slices(lmdb_version_store_dynamic_schema_v1):
+    lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+    df_1 = pd.DataFrame({
+        "group_col": [chr(ord("a")+i) for i in range(5)],
+        "mean_col": np.arange(0, 5, dtype=np.float64),
+        "sum_col": np.arange(0, 5, dtype=np.float64),
+        "min_col": np.arange(0, 5, dtype=np.float64),
+        "max_col": np.arange(0, 5, dtype=np.float64),
+        "count_col": np.arange(0, 5, dtype=np.float64),
+    })
+    df_2 = pd.DataFrame({
+        "group_col": [chr(ord("a")+i+10) for i in range(5)],
+    })
+    lib.write(sym, df_1, dynamic_strings=True)
+    lib.append(sym, df_2, dynamic_strings=True)
+
+    q = QueryBuilder()
+    q.groupby("group_col").agg({
+        "mean_col": "mean",
+        "sum_col": "sum",
+        "min_col": "min",
+        "max_col": "max",
+        "count_col": "count",
+    })
+
+    table = lib.read(sym, query_builder=q).data
+    # sum_col is correctly filled with 0s instead of nulls
+    assert pc.count(table.column("sum_col"), mode="only_null").as_py() == 0
+    # TODO: Fix the TODOs in `CopyToBufferTask` to make num_nulls=5 as expected
+    # For this test it so happens that one present and one missing value end up in the same bucket.
+    # Copying then default initializes the missing values instead of setting the validity bitmap.
+    # assert pc.count(table.column("mean_col"), mode="only_null").as_py() == 5
+    # assert pc.count(table.column("min_col"), mode="only_null").as_py() == 5
+    # assert pc.count(table.column("max_col"), mode="only_null").as_py() == 5
+    # assert pc.count(table.column("count_col"), mode="only_null").as_py() == 5
+    expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
+    assert_frame_equal_with_arrow(table, expected)
```
