@@ -705,3 +705,54 @@ def test_aggregation_empty_slices(lmdb_version_store_dynamic_schema_v1):
     assert pc.count(table.column("count_col"), mode="only_null").as_py() == 5
     expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
     assert_frame_equal_with_arrow(table, expected)
+
+
+def test_resample_empty_slices(lmdb_version_store_dynamic_schema_v1):
+    lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+    def gen_df(start, num_rows, with_columns=True):
+        data = {}
+        if with_columns:
+            data = {
+                "mean_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "sum_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "min_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "max_col": np.arange(start, start + num_rows, dtype=np.float64),
+                "count_col": np.arange(start, start + num_rows, dtype=np.float64),
+            }
+        index = pd.date_range(pd.Timestamp(2025, 1, start), periods=num_rows)
+        return pd.DataFrame(data, index=index)
+
+    slices = [
+        gen_df(1, 3),
+        gen_df(4, 2, False),  # We expect an entirely missing slice 4th-5th
+        gen_df(6, 3),
+        gen_df(9, 5, False),  # We expect two missing slices 10th-11th and 12th-13th
+        gen_df(14, 2),
+        gen_df(16, 2, False),  # We expect one missing slice 16th-17th
+        # TODO: If we don't finish with an append with columns our normalization metadata will be broken
+        gen_df(18, 1)
+    ]
+    for df_slice in slices:
+        lib.append(sym, df_slice, write_if_missing=True)
+
+    q = QueryBuilder()
+    q.resample("2d").agg({
+        "mean_col": "mean",
+        "sum_col": "sum",
+        "min_col": "min",
+        "max_col": "max",
+        "count_col": "count",
+    })
+
+    table = lib.read(sym, query_builder=q).data
+    # sum_col is correctly filled with 0s instead of nulls
+    assert pc.count(table.column("sum_col"), mode="only_null").as_py() == 0
+    # We expect 4 entirely empty buckets
+    assert pc.count(table.column("mean_col"), mode="only_null").as_py() == 4
+    assert pc.count(table.column("min_col"), mode="only_null").as_py() == 4
+    assert pc.count(table.column("max_col"), mode="only_null").as_py() == 4
+    assert pc.count(table.column("count_col"), mode="only_null").as_py() == 4
+    expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
+    assert_frame_equal_with_arrow(table, expected)
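
For reference, pc.count(column, mode="only_null") as used in the assertions above returns the number of null entries in an Arrow array. A minimal standalone sketch, illustrative only and not part of this change:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1.0, None, 3.0, None])
assert pc.count(arr, mode="only_null").as_py() == 2   # counts only the null slots
assert pc.count(arr, mode="only_valid").as_py() == 2  # default mode: non-null values only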