
Commit 3ebb24a
Fix arrow reading empty frames
During symbol concat we can end up with a Segment with zero columns. Convert that to arrow gracefully. This also uncovered that arrow normalization doesn't correctly construct pandas_metadata for empty dataframes; that is fixed and tested in this PR as well.

1 parent 4f3fdfd
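
For intuition, here is the pandas analogue of the operation that triggers the bug (a sketch only; arcticdb's concat pipeline is native and does not go through pandas). An inner join keeps just the columns common to all inputs, so with disjoint schemas the intersection is empty:

    import pandas as pd

    # An inner concat keeps only the shared columns; with disjoint schemas
    # the column intersection is empty.
    df_0 = pd.DataFrame({"col_0": [0]})
    df_1 = pd.DataFrame({"col_1": [1]})
    print(pd.concat([df_0, df_1], join="inner").shape)  # (2, 0)

arcticdb's inner concat for the same inputs yields a fully empty (0, 0) result, as the new test at the bottom of this diff asserts.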

4 files changed (+60, -2)

cpp/arcticdb/arrow/arrow_utils.cpp (5 additions, 0 deletions)

@@ -63,6 +63,11 @@ std::vector<sparrow::array> arrow_arrays_from_column(const Column& column, std::
 std::shared_ptr<std::vector<sparrow::record_batch>> segment_to_arrow_data(SegmentInMemory& segment) {
     const auto total_blocks = segment.num_blocks();
     const auto num_columns = segment.num_columns();
+    if (num_columns == 0) {
+        // We can't construct a record batch with no columns, so in this case we return an empty list
+        // of record batches, which needs special handling in Python.
+        return {};
+    }
     const auto column_blocks = segment.column(0).num_blocks();
     util::check(total_blocks == column_blocks * num_columns, "Expected regular block size");
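
Returning no record batches at all is the pragmatic choice here: even in pyarrow, where a zero-column record batch can technically be built, the row count is inferred from the arrays, so such a batch degenerates to 0 x 0 anyway. A small illustration in plain pyarrow (an analogy only; the C++ code above uses sparrow, not pyarrow):

    import pyarrow as pa

    # With no arrays there is nothing to infer a row count from,
    # so the batch collapses to 0 columns and 0 rows.
    batch = pa.record_batch([], names=[])
    print(batch.num_columns, batch.num_rows)  # 0 0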

python/arcticdb/version_store/_normalization.py (3 additions, 1 deletion)

@@ -742,7 +742,9 @@ def denormalize(self, item, norm_meta):
         index_type = pandas_meta.WhichOneof("index_type")
         if index_type == "index":
             index_meta = pandas_meta.index
-            if index_meta.is_physically_stored:
+            # Empty tables don't have `is_physically_stored=True`, but we still output them with an empty DatetimeIndex.
+            is_empty_table_with_datetime_index = len(item) == 0 and not index_meta.step
+            if index_meta.is_physically_stored or is_empty_table_with_datetime_index:
                 pandas_indexes = 1
                 if index_meta.tz:
                     timezones[0] = index_meta.tz
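
The `not index_meta.step` check appears to lean on proto3 defaults: an unset numeric field reads as 0, which is falsy, while a stored RangeIndex presumably always records a nonzero step, since pandas itself rejects a zero step. So on a zero-row frame, "no step recorded" implies the empty DatetimeIndex that normalization attaches at write time. A quick sketch of the two index shapes being distinguished (plain pandas, nothing arcticdb-specific):

    import pandas as pd

    # A zero-row RangeIndex still carries a nonzero step...
    range_idx = pd.RangeIndex(start=0, stop=0, step=1)
    assert range_idx.step == 1

    # ...and pandas refuses step=0 outright, so a recorded step is never 0.
    try:
        pd.RangeIndex(start=0, stop=0, step=0)
    except ValueError:
        pass  # step must not be zero

    # The other empty shape: the DatetimeIndex attached to empty frames at write time.
    dt_idx = pd.DatetimeIndex([])
    assert len(dt_idx) == 0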

python/arcticdb/version_store/_store.py (5 additions, 1 deletion)

@@ -2432,7 +2432,11 @@ def _adapt_read_res(self, read_result: ReadResult) -> VersionedItem:
             record_batches = []
             for record_batch in frame_data.extract_record_batches():
                 record_batches.append(pa.RecordBatch._import_from_c(record_batch.array(), record_batch.schema()))
-            table = pa.Table.from_batches(record_batches)
+            if len(record_batches) == 0:
+                # We get an empty list of record batches when output has no columns
+                table = pa.Table.from_arrays([])
+            else:
+                table = pa.Table.from_batches(record_batches)
             data = self._arrow_normalizer.denormalize(table, read_result.norm)
         else:
             data = self._normalizer.denormalize(read_result.frame_data, read_result.norm)
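
The new branch is needed because `pa.Table.from_batches` takes its schema either from an explicit argument or from the first batch, so an empty list leaves it with nothing to build from, while `pa.Table.from_arrays([])` constructs the genuinely empty table. A quick illustration in plain pyarrow (independent of arcticdb; the exact exception reflects current pyarrow behavior and may vary by version):

    import pyarrow as pa

    # No batches and no schema: nothing to derive the table's schema from.
    try:
        pa.Table.from_batches([])
    except ValueError:
        pass  # pyarrow requires a schema or at least one record batch

    # An empty from_arrays call yields the 0-column, 0-row table the read path wants.
    table = pa.Table.from_arrays([])
    print(table.shape)  # (0, 0)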

python/tests/unit/arcticdb/version_store/test_arrow.py (47 additions, 0 deletions)

@@ -75,6 +75,35 @@ def test_bool_columns(lmdb_version_store_arrow):
     assert_frame_equal_with_arrow(table, df)
 
 
+def test_read_empty(lmdb_version_store_arrow):
+    lib = lmdb_version_store_arrow
+    sym = "sym"
+    df = pd.DataFrame()
+    lib.write(sym, df)
+    table = lib.read(sym).data
+    expected = lib.read(sym, output_format=OutputFormat.PANDAS).data
+    # During normalization when doing the write we attach an empty DatetimeIndex to the DataFrame.
+    # We correctly see it in arrow.
+    assert table.column_names == ["index"]
+    assert table.shape == (0, 1)
+    # With no columns, arcticdb's read(output_format=PANDAS) produces a `pd.RangeIndex(start=0, stop=0, step=1)`
+    # column index, whereas pyarrow's to_pandas produces `pd.Index([])`.
+    expected.columns = pd.Index([])
+    assert_frame_equal_with_arrow(table, expected)
+
+
+def test_read_empty_with_columns(lmdb_version_store_arrow):
+    lib = lmdb_version_store_arrow
+    sym = "sym"
+    df = pd.DataFrame({"col_int": np.zeros(0, dtype=np.int32), "col_float": np.zeros(0, dtype=np.float64)})
+    lib.write(sym, df)
+    table = lib.read(sym).data
+    expected = lib.read(sym, output_format=OutputFormat.PANDAS).data
+    assert table.column_names == ["index", "col_int", "col_float"]
+    assert table.shape == (0, 3)
+    assert_frame_equal_with_arrow(table, expected)
+
+
 def test_column_filtering(lmdb_version_store_arrow):
     lib = lmdb_version_store_arrow
     df = pd.DataFrame({"x": np.arange(10), "y": np.arange(10.0, 20.0)})

@@ -925,3 +954,21 @@ def test_resample_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_s
     table = lib.read(sym, date_range=date_range, query_builder=q).data
     expected = pd.DataFrame({"to_sum": [6]}, index=[pd.Timestamp(0)])
     assert_frame_equal_with_arrow(table, expected)
+
+
+def test_symbol_concat_empty_intersection(lmdb_version_store_arrow):
+    # Tests a failing subset of test_symbol_concat_empty_column_intersection.
+    # TODO: Remove this test if we enable pipeline tests with arrow
+    lib = lmdb_version_store_arrow
+    sym_0 = "sym_0"
+    sym_1 = "sym_1"
+    df_0 = pd.DataFrame({"col_0": [0]})
+    df_1 = pd.DataFrame({"col_1": [1]})
+    lib.write(sym_0, df_0)
+    lib.write(sym_1, df_1)
+    q = QueryBuilder().concat("inner")
+    table = lib.batch_read_and_join([sym_0, sym_1], query_builder=q).data
+    assert table.column_names == []
+    assert table.shape == (0, 0)
+    expected = pd.DataFrame()
+    assert_frame_equal_with_arrow(table, expected)
