Commit 4f3fdfd
Fix resample empty bucket with date range
In some fairly niche cases the processing pipeline can produce empty segments:

- Resampling with dynamic schema
- A date range filter with no index values within an intersecting data key (fixed in a previous PR)

We previously tried to allocate a 0-sized memory block, which raised assertion failures. This PR simply skips allocating the 0-sized memory blocks and adds an Arrow test to verify the fix.
1 parent ea2c795 commit 4f3fdfd
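For context on why the 0-sized blocks were a problem: chunked buffers locate memory blocks by their starting byte offset, so no two blocks may share an offset. Below is a minimal illustrative sketch of that invariant; `ToyChunkedBuffer` and its members are hypothetical and not ArcticDB's actual implementation:

```cpp
#include <cassert>
#include <cstddef>
#include <iterator>
#include <map>

// Hypothetical model of a chunked buffer: blocks are keyed by their
// starting byte offset, so the offset -> block mapping must be injective.
struct ToyChunkedBuffer {
    std::map<std::size_t, std::size_t> block_size_by_offset_;
    std::size_t bytes_ = 0;

    void add_block(std::size_t size) {
        // A 0-sized block would start at the same offset as the next block,
        // colliding on the same key. This is the invariant the commit
        // preserves by skipping the allocation of empty blocks entirely.
        assert(size > 0);
        block_size_by_offset_[bytes_] = size;
        bytes_ += size;
    }

    // Find the block containing a byte position, the way offset-driven
    // consumers would address the buffer.
    std::size_t block_offset_containing(std::size_t pos) const {
        auto it = block_size_by_offset_.upper_bound(pos);
        assert(it != block_size_by_offset_.begin());
        return std::prev(it)->first;
    }
};

int main() {
    ToyChunkedBuffer buf;
    buf.add_block(16);
    // buf.add_block(0);  // would break: next block would also start at 16
    buf.add_block(8);
    assert(buf.block_offset_containing(20) == 16);
}
```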

2 files changed (+32, -5)

cpp/arcticdb/pipeline/read_frame.cpp

Lines changed: 11 additions & 5 deletions
```diff
@@ -107,11 +107,17 @@ SegmentInMemory allocate_chunked_frame(const std::shared_ptr<PipelineContext>& c
     };
     auto handlers = TypeHandlerRegistry::instance();
-    if (row_count > 0) {
-        for (auto& column : output.columns()) {
-            auto handler = handlers->get_handler(output_format, column->type());
-            const auto data_size = data_type_size(column->type(), output_format, DataTypeMode::EXTERNAL);
-            for (auto block_row_count : block_row_counts) {
+    for (auto& column : output.columns()) {
+        auto handler = handlers->get_handler(output_format, column->type());
+        const auto data_size = data_type_size(column->type(), output_format, DataTypeMode::EXTERNAL);
+        for (auto block_row_count : block_row_counts) {
+            if (block_row_count > 0) {
+                // We can end up with empty segments from the processing pipeline, e.g. when:
+                // - Filtering a data key to the empty set (e.g. date_range = (3, 3) in a data key with no index=3)
+                // - Resampling with a date range with a bucket slice containing no indices
+                // 0 sized memory blocks would break the offset assumptions in chunked buffers, and it is fine to have
+                // number of memory blocks not equal number of segments because follow-up methods like
+                // `copy_frame_data_to_buffer` rely on offsets rather than block indices.
                 const auto bytes = block_row_count * data_size;
                 column->allocate_data(bytes);
                 column->advance_data(bytes);
```

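The new comment relies on downstream copying being driven by running row offsets rather than a one-to-one pairing of segments and memory blocks. A rough sketch of that idea, with hypothetical names (the real `copy_frame_data_to_buffer` in ArcticDB is more involved):

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

// Copy per-segment column data into one flat destination, addressed by a
// running row offset. The real destination is a chunked buffer, but the
// addressing idea is the same: an empty segment advances the offset by zero
// and contributes no memory block, and nothing downstream notices.
void copy_segments_by_offset(const std::vector<std::vector<double>>& segments,
                             std::vector<double>& dest) {
    std::size_t row_offset = 0;
    for (const auto& segment : segments) {
        if (!segment.empty()) {
            std::memcpy(dest.data() + row_offset, segment.data(),
                        segment.size() * sizeof(double));
        }
        row_offset += segment.size();
    }
}

int main() {
    // Three segments, the middle one empty (e.g. a filtered-out row slice).
    std::vector<std::vector<double>> segments{{1.0, 2.0}, {}, {3.0}};
    std::vector<double> dest(3);
    copy_segments_by_offset(segments, dest);  // dest == {1.0, 2.0, 3.0}
}
```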
python/tests/unit/arcticdb/version_store/test_arrow.py

Lines changed: 21 additions & 0 deletions
```diff
@@ -904,3 +904,24 @@ def gen_df(start, num_rows, with_columns=True):
     assert pc.count(table.column("count_col"), mode="only_null").as_py() == 4
     expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
     assert_frame_equal_with_arrow(table, expected)
+
+
+def test_resample_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_segment):
+    # Closely mimics test_resampling_row_slice_responsible_for_no_buckets with arrow from test_resample.py
+    # TODO: Remove this test if we enable pipeline tests with arrow
+    lib = lmdb_version_store_tiny_segment
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+    df = pd.DataFrame(
+        {
+            "to_sum": [1, 2, 3, 4],
+        },
+        index=[pd.Timestamp(0), pd.Timestamp(100), pd.Timestamp(200), pd.Timestamp(3000)],
+    )
+    lib.write(sym, df)
+
+    q = QueryBuilder().resample("us").agg({"to_sum": ("to_sum", "sum")})
+    date_range = (pd.Timestamp(0), pd.Timestamp(1500))
+    table = lib.read(sym, date_range=date_range, query_builder=q).data
+    expected = pd.DataFrame({"to_sum": [6]}, index=[pd.Timestamp(0)])
+    assert_frame_equal_with_arrow(table, expected)
```
