
Commit 1b13f92

[9898131742] Fix arrow projection with dynamic schema (#2630)
#### Reference Issues/PRs

Monday ref: 9898131742

#### What does this implement or fix?

This PR modifies the `NullValueReducer` so that it no longer relies on the slice index; by preserving `column_block_idx_` state between row-slices it also avoids an unneeded `log(n)` search for the offset.

#### Any other comments?

The `NullValueReducer` assumed that `len(slice_and_keys) == len(row_slices_per_column)` when using `dynamic_schema=True`. That does not hold when projections are used. E.g. for the following projection our slicing would look like:

```
Given:

TD key 1:
index  A
1      1
2      2

TD key 2:
index  A  B
3      3  1
4      4  2

TD key 3:
index  B
5      3
6      4

And we do a projection like `q.apply("C", q["A"] + q["B"])`, our slicing would look like:

Slice 1: TD key 1
Slice 2: TD key 2
Slice 3:
index  C
3      4
4      6
Slice 4: TD key 3
```

#### Checklist

<details>
<summary>Checklist for code changes...</summary>

- [ ] Have you updated the relevant docstrings, documentation and copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes?

</details>
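For illustration only (not part of the diff), a minimal pyarrow sketch of the example above, using the same `pa.concat_tables(..., promote_options="permissive")` call as the new tests. The table names are made up for this sketch; it shows that the projected column `C` only has values on the middle row-slice, while slices missing `A` or `B` carry nulls:

```python
import pyarrow as pa
import pyarrow.compute as pc

# The three row-slices from the example above, as written under dynamic schema.
td_key_1 = pa.table({"index": [1, 2], "A": [1, 2]})
td_key_2 = pa.table({"index": [3, 4], "A": [3, 4], "B": [1, 2]})
td_key_3 = pa.table({"index": [5, 6], "B": [3, 4]})

# Permissive promotion fills columns that are absent from a row-slice with nulls.
combined = pa.concat_tables([td_key_1, td_key_2, td_key_3], promote_options="permissive")

# The projection q.apply("C", q["A"] + q["B"]) is null wherever A or B is null,
# i.e. everywhere except the middle row-slice (values 4 and 6).
combined = combined.append_column("C", pc.add(combined.column("A"), combined.column("B")))
print(combined.to_pandas())
```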
1 parent 68603dc commit 1b13f92

File tree

4 files changed (+78 -20 lines)


cpp/arcticdb/pipeline/read_frame.cpp

Lines changed: 19 additions & 16 deletions
@@ -732,6 +732,7 @@ class NullValueReducer {
     std::shared_ptr<PipelineContext> context_;
     SegmentInMemory frame_;
     size_t pos_;
+    size_t column_block_idx_;
     DecodePathData shared_data_;
     std::any& handler_data_;
     const OutputFormat output_format_;
@@ -751,6 +752,7 @@ class NullValueReducer {
         context_(context),
         frame_(std::move(frame)),
         pos_(frame_.offset()),
+        column_block_idx_(0),
         shared_data_(std::move(shared_data)),
         handler_data_(handler_data),
         output_format_(output_format),
@@ -761,18 +763,17 @@ class NullValueReducer {
         return context_row.slice_and_key().slice_.row_range.first;
     }

-    void backfill_all_zero_validity_bitmaps(size_t offset_bytes_start, size_t offset_bytes_end_idx) {
-        // Explanation: offset_bytes_start and offset_bytes_end should both be elements of block_offsets by
-        // construction. We must add an all zeros validity bitmap for each row-slice read from storage where this
-        // column was missing, in order to correctly populate the Arrow record-batches for the output
+    void backfill_all_zero_validity_bitmaps_up_to(std::optional<size_t> up_to_block_offset) {
+        // Fills up all validity bitmaps with zeros from `column_block_idx_` until reaching `up_to_block_offset`.
+        // If `up_to_block_offset` is `std::nullopt` then fills up until the end of the column.
         const auto& block_offsets = column_.block_offsets();
-        auto start_it = std::ranges::lower_bound(block_offsets, offset_bytes_start);
-        util::check(start_it != block_offsets.cend() && *start_it == offset_bytes_start,
-                    "NullValueReducer: Failed to find offset_bytes_start {} in block_offsets {}",
-                    offset_bytes_start, block_offsets);
-        for (auto idx = static_cast<size_t>(std::distance(block_offsets.begin(), start_it)); idx < offset_bytes_end_idx; ++idx) {
-            auto rows = (block_offsets.at(idx + 1) - block_offsets.at(idx)) / type_bytes_;
-            create_dense_bitmap_all_zeros(block_offsets.at(idx), rows, column_, AllocationType::DETACHABLE);
+        util::check(!up_to_block_offset.has_value() || up_to_block_offset.value() <= block_offsets.back(), "up_to_block_offset outside of range");
+        for (; column_block_idx_ < block_offsets.size() - 1; ++column_block_idx_) {
+            if (up_to_block_offset.has_value() && block_offsets.at(column_block_idx_) >= up_to_block_offset.value()) {
+                break;
+            }
+            auto rows = (block_offsets.at(column_block_idx_ + 1) - block_offsets.at(column_block_idx_)) / type_bytes_;
+            create_dense_bitmap_all_zeros(block_offsets.at(column_block_idx_), rows, column_, AllocationType::DETACHABLE);
         }
     }

@@ -783,18 +784,20 @@ class NullValueReducer {
         if (current_pos != pos_) {
             const auto num_rows = current_pos - pos_;
             const auto start_row = pos_ - frame_.offset();
+            const auto end_row = current_pos - frame_.offset();
             if (const std::shared_ptr<TypeHandler>& handler = get_type_handler(output_format_, column_.type()); handler) {
                 handler->default_initialize(column_.buffer(), start_row * handler->type_size(), num_rows * handler->type_size(), shared_data_, handler_data_);
             } else if (output_format_ != OutputFormat::ARROW) {
                 // Arrow does not care what values are in the main buffer where the validity bitmap is zero
                 column_.default_initialize_rows(start_row, num_rows, false, default_value_);
             }
             if (output_format_ == OutputFormat::ARROW) {
-                backfill_all_zero_validity_bitmaps(start_row * type_bytes_, context_row.index());
+                backfill_all_zero_validity_bitmaps_up_to(end_row * type_bytes_);
             }
-            pos_ = current_pos + sz_to_advance;
-        } else {
-            pos_ += sz_to_advance;
+        }
+        pos_ = current_pos + sz_to_advance;
+        if (output_format_ == OutputFormat::ARROW) {
+            ++column_block_idx_;
         }
     }

@@ -812,7 +815,7 @@ class NullValueReducer {
                 column_.default_initialize_rows(start_row, num_rows, false, default_value_);
             }
             if (output_format_ == OutputFormat::ARROW) {
-                backfill_all_zero_validity_bitmaps(start_row * type_bytes_, column_.block_offsets().size() - 1);
+                backfill_all_zero_validity_bitmaps_up_to(std::nullopt);
             }
         }
     }
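A rough Python sketch (not part of the diff; the callback and parameter names are stand-ins for the C++ members above) of what the new `backfill_all_zero_validity_bitmaps_up_to` does: because `column_block_idx_` is kept as reducer state, each row-slice resumes from the block where the previous one stopped, instead of re-locating its start offset with `std::ranges::lower_bound` on every call.

```python
from typing import Callable, List, Optional

def backfill_all_zero_validity_bitmaps_up_to(
    block_offsets: List[int],                       # byte offset of each block, plus a past-the-end entry
    column_block_idx: int,                          # cursor carried across row-slices (column_block_idx_ in C++)
    type_bytes: int,                                # size in bytes of one element of the column's type
    up_to_block_offset: Optional[int],              # stop before this byte offset; None means "to the end"
    create_dense_bitmap_all_zeros: Callable[[int, int], None],  # stand-in for the C++ helper
) -> int:
    """Zero-fill validity bitmaps from the cursor up to (but not including) up_to_block_offset."""
    assert up_to_block_offset is None or up_to_block_offset <= block_offsets[-1]
    while column_block_idx < len(block_offsets) - 1:
        if up_to_block_offset is not None and block_offsets[column_block_idx] >= up_to_block_offset:
            break
        rows = (block_offsets[column_block_idx + 1] - block_offsets[column_block_idx]) // type_bytes
        create_dense_bitmap_all_zeros(block_offsets[column_block_idx], rows)
        column_block_idx += 1
    return column_block_idx  # the caller stores this back so the next row-slice resumes here
```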

cpp/arcticdb/version/version_core.cpp

Lines changed: 3 additions & 0 deletions
@@ -1381,6 +1381,7 @@ void copy_frame_data_to_buffer(
         const ColumnMapping mapping{src_column.type(), dst_column.type(), destination.field(target_index), type_size, num_rows, row_range.first, offset, total_size, target_index};
         handler->convert_type(src_column, dst_column, mapping, shared_data, handler_data, source.string_pool_ptr());
     } else if (is_empty_type(src_column.type().data_type())) {
+        // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
         dst_column.type().visit_tag([&](auto dst_desc_tag) {
             util::initialize<decltype(dst_desc_tag)>(dst_ptr, total_size, default_value);
         });
@@ -1389,6 +1390,7 @@ void copy_frame_data_to_buffer(
         details::visit_type(dst_column.type().data_type(), [&](auto dst_tag) {
             using dst_type_info = ScalarTypeInfo<decltype(dst_tag)>;
             typename dst_type_info::RawType* typed_dst_ptr = reinterpret_cast<typename dst_type_info::RawType*>(dst_ptr);
+            // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
             util::initialize<typename dst_type_info::TDT>(dst_ptr, num_rows * dst_rawtype_size, default_value);
             details::visit_type(src_column.type().data_type(), [&](auto src_tag) {
                 using src_type_info = ScalarTypeInfo<decltype(src_tag)>;
@@ -1408,6 +1410,7 @@ void copy_frame_data_to_buffer(
                 dst_ptr += row_count * sizeof(SourceType);
             }
         } else {
+            // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
             util::initialize<SourceTDT>(dst_ptr, num_rows * dst_rawtype_size, default_value);
             SourceType* typed_dst_ptr = reinterpret_cast<SourceType*>(dst_ptr);
             Column::for_each_enumerated<SourceTDT>(src_column, [&](const auto& row) {

python/arcticdb/util/test.py

Lines changed: 13 additions & 3 deletions
@@ -242,15 +242,25 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
     assert_frame_equal(left=expected, right=actual)


-def convert_arrow_to_pandas_and_remove_categoricals(table):
+def convert_arrow_to_pandas_for_tests(table):
+    """
+    Converts pa.Table outputted via `output_format=OutputFormat.EXPERIMENTAL_ARROW` to a pd.DataFrame so it would be
+    identical to the one outputted via `output_format=OutputFormat.PANDAS`. This requires two changes:
+    - Replaces dictionary encoded string columns with regular string columns.
+    - Fills null values in int columns with zeros.
+    """
     new_table = stringify_dictionary_encoded_columns(table)
+    for i, name in enumerate(new_table.column_names):
+        if pa.types.is_integer(new_table.column(i).type):
+            new_col = new_table.column(i).fill_null(0)
+            new_table = new_table.set_column(i, name, new_col)
     return new_table.to_pandas()

 def assert_frame_equal_with_arrow(left, right, **kwargs):
     if isinstance(left, pa.Table):
-        left = convert_arrow_to_pandas_and_remove_categoricals(left)
+        left = convert_arrow_to_pandas_for_tests(left)
     if isinstance(right, pa.Table):
-        right = convert_arrow_to_pandas_and_remove_categoricals(right)
+        right = convert_arrow_to_pandas_for_tests(right)
     assert_frame_equal(left, right, **kwargs)
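A small usage sketch of the renamed helper (illustrative only; it assumes this PR's version of `arcticdb.util.test` is importable). Per the docstring above, integer nulls produced by dynamic schema are zero-filled so the frame matches the `OutputFormat.PANDAS` result:

```python
import pyarrow as pa
from arcticdb.util.test import convert_arrow_to_pandas_for_tests

# An int column with a null, as dynamic schema can produce when a row-slice lacked the column.
arrow_table = pa.table({"a": pa.array([1, 2, None], type=pa.int64())})
df = convert_arrow_to_pandas_for_tests(arrow_table)
assert df["a"].tolist() == [1, 2, 0]  # nulls backfilled with 0, matching the Pandas output format
```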

python/tests/unit/arcticdb/version_store/test_arrow.py

Lines changed: 43 additions & 1 deletion
@@ -10,14 +10,15 @@
 from arcticdb.version_store.processing import QueryBuilder
 from arcticdb.options import OutputFormat
 import pyarrow as pa
+import pyarrow.compute as pc
 from arcticdb.util.hypothesis import (
     use_of_function_scoped_fixtures_in_hypothesis_checked,
     ENDIANNESS,
     supported_string_dtypes,
     dataframe_strategy,
     column_strategy,
 )
-from arcticdb.util.test import get_sample_dataframe
+from arcticdb.util.test import get_sample_dataframe, make_dynamic
 from arcticdb_ext.storage import KeyType
 from tests.util.mark import WINDOWS

@@ -625,3 +626,44 @@ def test_arrow_dynamic_schema_filtered_column(lmdb_version_store_dynamic_schema_
     q = q[q["col"] < 5]
     received = stringify_dictionary_encoded_columns(lib.read(sym, query_builder=q).data)
     assert expected.equals(received)
+
+
+def test_project_dynamic_schema(lmdb_version_store_dynamic_schema_v1):
+    lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+    table_1 = pa.table({"a": pa.array([1, 2])})
+    table_2 = pa.table({"a": pa.array([3, 4]), "b": pa.array([1, 2])})
+    table_3 = pa.table({"b": pa.array([3, 4])})
+    lib.write(sym, table_1.to_pandas())
+    lib.append(sym, table_2.to_pandas())
+    lib.append(sym, table_3.to_pandas())
+    q = QueryBuilder()
+    q = q.apply("c", q["a"] * q["b"] + 10)
+    received = lib.read(sym, query_builder=q).data
+    expected = pa.concat_tables([table_1, table_2, table_3], promote_options="permissive")
+    expected_new_col = pc.add(pc.multiply(expected.column("a"), expected.column("b")), 10)
+    expected = expected.append_column("c", expected_new_col)
+    assert expected.equals(received)
+
+
+def test_project_dynamic_schema_complex(lmdb_version_store_dynamic_schema_v1):
+    lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+    df = pd.DataFrame({
+        "int_col_1": np.arange(0, 10, dtype=np.int16),
+        "int_col_2": np.arange(10, 20, dtype=np.int32),
+        "float_col": np.arange(20, 30, dtype=np.float64),
+    })
+    expected, slices = make_dynamic(df)
+    for df_slice in slices:
+        lib.append(sym, df_slice, write_if_missing=True)
+
+    q = QueryBuilder()
+    q = q.apply("new_float_1", q["int_col_1"] / q["float_col"] + 1)
+    q = q.apply("new_float_2", q["int_col_2"] * q["new_float_1"])
+
+    table = lib.read(sym, query_builder=q).data
+    expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
+    assert_frame_equal_with_arrow(table, expected)
