23 | 23 | from datetime import timedelta, timezone
24 | 24 |
25 | 25 | from arcticdb.exceptions import ArcticNativeException, SortingException
| 26 | +from arcticdb.version_store.processing import QueryBuilder |
26 | 27 | from arcticdb_ext.version_store import StreamDescriptorMismatch, NoSuchVersionException
27 | 28 |
28 | 29 | from arcticdb_ext.exceptions import (
29 | 30 | UnsortedDataException,
30 | 31 | InternalException,
31 | 32 | NormalizationException,
| 33 | + UserInputException, |
| 34 | + MissingDataException, |
| 35 | + SchemaException, |
32 | 36 | )
33 | 37 |
34 | 38 | from benchmarks.bi_benchmarks import assert_frame_equal
35 | | -from tests.util.mark import LINUX, SLOW_TESTS_MARK |
| 39 | +from tests.util.mark import LINUX, SLOW_TESTS_MARK, WINDOWS |
36 | 40 |
37 | 41 |
38 | 42 | def add_index(df: pd.DataFrame, start_time: pd.Timestamp):
@@ -584,6 +588,198 @@ def check_incomplete_staged(sym: str, remove_staged: bool = True) -> None:
584 | 588 | check_incomplete_staged(symbol)
585 | 589 |
586 | 590 |
| 591 | +@pytest.mark.parametrize("dynamic_strings", [True, False]) |
| 592 | +@pytest.mark.storage |
| 593 | +def test_batch_read_and_join_scenarios(basic_store_factory, dynamic_strings): |
| 594 | + """Cover batch_read_and_join with various parameter combinations and error conditions.""" |
| 595 | + lib: NativeVersionStore = basic_store_factory(dynamic_strings=dynamic_strings) |
| 596 | + |
| 597 | + q = QueryBuilder() |
| 598 | + q.concat("outer") |
| 599 | + df0 = ( |
| 600 | + DFGenerator(size=20) |
| 601 | + .add_bool_col("bool") |
| 602 | + .add_float_col("A", np.float32) |
| 603 | + .add_int_col("B", np.int32) |
| 604 | + .add_int_col("C", np.uint16) |
| 605 | + .generate_dataframe() |
| 606 | + ) |
| 607 | + |
| 608 | + df0_1 = ( |
| 609 | + DFGenerator(size=20) |
| 610 | + .add_bool_col("bool") |
| 611 | + .add_float_col("A", np.float32) |
| 612 | + .add_int_col("B", np.int16) |
| 613 | + .add_int_col("C", np.uint16) |
| 614 | + .add_string_col("str", 10, include_unicode=True) |
| 615 | + .add_timestamp_col("ts") |
| 616 | + .generate_dataframe() |
| 617 | + ) |
| 618 | + |
| 619 | + df1_len = 13 |
| 620 | + df1 = ( |
| 621 | + DFGenerator(size=df1_len) |
| 622 | + .add_bool_col("bool") |
| 623 | + .add_float_col("A", np.float64) |
| 624 | + .add_float_col("B", np.float32) |
| 625 | + .add_int_col("C", np.int64) |
| 626 | + .generate_dataframe() |
| 627 | + ) |
| 628 | + |
| 629 | + df1_1 = ( |
| 630 | + DFGenerator(size=df1_len) |
| 631 | + .add_bool_col("bool") |
| 632 | + .add_string_col("A", 10) |
| 633 | + .add_int_col("B", np.int64) |
| 634 | + .generate_dataframe() |
| 635 | + ) |
| 636 | + |
| 637 | + lib.write("symbol0", df0) |
| 638 | + lib.write("symbol1", df1) |
| 639 | + |
| 640 | + # Concatenate multiple times |
| 641 | + data: pd.DataFrame = lib.batch_read_and_join(["symbol0", "symbol1", "symbol0", "symbol1"], query_builder=q).data |
| 642 | + expected = pd.concat([df0, df1, df0, df1], ignore_index=True) |
| 643 | + assert_frame_equal(expected, data) |
| 644 | + |
| 645 | + # Concatenate with an invalid query builder (no concat clause) - expect UserInputException |
| 646 | + with pytest.raises(UserInputException): |
| 647 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 648 | + ["symbol0", "symbol1"], query_builder=QueryBuilder(), as_ofs=[0, 0] |
| 649 | + ).data |
| 650 | + |
| 651 | + # Concatenate with column filter |
| 652 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 653 | + ["symbol0", "symbol1"], query_builder=q, columns=[["A", "C", "none"], None] |
| 654 | + ).data |
| 655 | + df0_subset = df0[["A", "C"]] |
| 656 | + expected = pd.concat([df0_subset, df1], ignore_index=True) |
| 657 | + # pandas concat fills missing bools with NaN; ArcticDB uses False |
| 658 | + expected["bool"] = expected["bool"].fillna(False) |
| 659 | + assert_frame_equal(expected, data) |
| 660 | + |
| 661 | + # Concatenate symbols with column filters + row ranges; |
| 662 | + # the second row range's upper bound is beyond the length of the data |
| 663 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 664 | + ["symbol0", "symbol1"], query_builder=q, columns=[["A"], ["B"]], row_ranges=[(2, 3), (10, df1_len + 2)] |
| 665 | + ).data |
| 666 | + df0_subset = df0.loc[2:2, ["A"]] |
| 667 | + df1_subset = df1.loc[10:df1_len, ["B"]] |
| 668 | + expected = pd.concat([df0_subset, df1_subset], ignore_index=True) |
| 669 | + assert_frame_equal(expected, data) |
| 670 | + |
| 671 | + lib.write("symbol0", df0_1) |
| 672 | + lib.write("symbol1", df1_1) |
| 673 | + |
| 674 | + # Concatenate versions with mismatched schemas - expect SchemaException |
| 675 | + with pytest.raises(SchemaException): |
| 676 | + data: pd.DataFrame = lib.batch_read_and_join(["symbol0", "symbol1"], as_ofs=[1, 1], query_builder=q) |
| 677 | + |
| 678 | + # Concatenate specific versions (as_ofs) with per-symbol column filters |
| 679 | + # and a row range applied only to the first symbol |
| 680 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 681 | + ["symbol0", "symbol1", "symbol0"], |
| 682 | + as_ofs=[0, 0, 1], |
| 683 | + query_builder=q, |
| 684 | + columns=[None, ["B", "C"], None], |
| 685 | + row_ranges=[(2, 3), None, None], |
| 686 | + ).data |
| 687 | + df0_subset = df0.loc[2:2] |
| 688 | + df1_subset = df1[["B", "C"]] |
| 689 | + expected = pd.concat([df0_subset, df1_subset, df0_1], ignore_index=True) |
| 690 | + # pandas concat fills missing bools with NaN; ArcticDB uses False |
| 691 | + expected["bool"] = expected["bool"].fillna(False) |
| 692 | + # pandas concat fills missing strings with NaN; ArcticDB uses None |
| 693 | + if not dynamic_strings: |
| 694 | + # align expected with the actual result: static strings fill missing values with "" |
| 695 | + if not WINDOWS: |
| 696 | + # Windows does not support static strings |
| 697 | + expected["str"] = expected["str"].fillna("") |
| 698 | + assert_frame_equal(expected, data) |
| 699 | + |
| 700 | + # Cover per-symbol query builders |
| 701 | + q0 = QueryBuilder() |
| 702 | + q0 = q0[q0["A"] == 123.58743343] |
| 703 | + q1 = QueryBuilder() |
| 704 | + q1 = q1[q1["B"] == -3483.123434343] |
| 705 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 706 | + ["symbol0", "symbol1"], as_ofs=[0, 0], query_builder=q, per_symbol_query_builders=[q0, q1] |
| 707 | + ).data |
| 708 | + assert len(data) == 0 # Nothing is selected |
| 709 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 710 | + ["symbol0", "symbol1"], as_ofs=[0, 0], query_builder=q, per_symbol_query_builders=[q0, None] |
| 711 | + ).data |
| 712 | + assert_frame_equal(df1, data) |
| 713 | + |
| 714 | + |
| 715 | +@pytest.mark.xfail(True, reason="When a non-existent symbol is requested, MissingDataException is not raised - 18023146743") |
| 716 | +def test_batch_read_and_join_scenarios_errors(basic_store): |
| 717 | + lib: NativeVersionStore = basic_store |
| 718 | + |
| 719 | + q = QueryBuilder() |
| 720 | + q.concat("outer") |
| 721 | + df0 = DFGenerator(size=20).add_bool_col("bool").generate_dataframe() |
| 722 | + |
| 723 | + lib.write("symbol0", df0) |
| 724 | + |
| 725 | + # Concatenating with a non-existent symbol should raise MissingDataException |
| 726 | + with pytest.raises(MissingDataException): |
| 727 | + data: pd.DataFrame = lib.batch_read_and_join(["symbol0", "symbol2"], query_builder=q).data |
| 728 | + |
| 729 | + |
| 730 | +@pytest.mark.storage |
| 731 | +@pytest.mark.xfail(True, reason="Column filtering does not work with dynamic schema - 18023047637") |
| 732 | +def test_batch_read_and_join_scenarios_dynamic_schema_filtering_error(lmdb_version_store_dynamic_schema_v1): |
| 733 | + lib: NativeVersionStore = lmdb_version_store_dynamic_schema_v1 |
| 734 | + |
| 735 | + q = QueryBuilder() |
| 736 | + q.concat("outer") |
| 737 | + df0 = ( |
| 738 | + DFGenerator(size=20) |
| 739 | + .add_bool_col("bool") |
| 740 | + .add_float_col("A", np.float32) |
| 741 | + .add_int_col("B", np.int32) |
| 742 | + .add_int_col("C", np.uint16) |
| 743 | + .generate_dataframe() |
| 744 | + ) |
| 745 | + |
| 746 | + df1_len = 13 |
| 747 | + df1 = ( |
| 748 | + DFGenerator(size=df1_len) |
| 749 | + .add_bool_col("bool") |
| 750 | + .add_float_col("A", np.float64) |
| 751 | + .add_float_col("B", np.float32) |
| 752 | + .add_int_col("C", np.int64) |
| 753 | + .generate_dataframe() |
| 754 | + ) |
| 755 | + |
| 756 | + lib.write("symbol0", df0) |
| 757 | + lib.write("symbol1", df1) |
| 758 | + |
| 759 | + # Concatenate with column filter |
| 760 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 761 | + ["symbol0", "symbol1"], query_builder=q, columns=[["A", "C", "none"], None] |
| 762 | + ).data |
| 763 | + df0_subset = df0[["A", "C"]] |
| 764 | + expected = pd.concat([df0_subset, df1], ignore_index=True) |
| 765 | + # pandas concat fills missing bools with NaN; ArcticDB uses False |
| 766 | + expected["bool"] = expected["bool"].fillna(False) |
| 767 | + # ERROR: with dynamic schema, filtering of the columns fails: in the 'data' df, |
| 768 | + # the rows coming from symbol0 contain values in the 'bool' and 'B' columns |
| 769 | + # instead of the expected None/NaN values that were filtered out |
| 770 | + # (with a static schema, e.g. the 'basic_store' fixture, everything would be fine) |
| 771 | + assert_frame_equal(expected, data) |
| 772 | + |
| 773 | + data: pd.DataFrame = lib.batch_read_and_join( |
| 774 | + ["symbol0", "symbol1"], query_builder=q, columns=[["A"], ["B"]], row_ranges=[(2, 3), (10, df1_len + 2)] |
| 775 | + ).data |
| 776 | + df0_subset = df0.loc[2:2, ["A"]] |
| 777 | + df1_subset = df1.loc[10:df1_len, ["B"]] |
| 778 | + expected = pd.concat([df0_subset, df1_subset], ignore_index=True) |
| 779 | + # ERROR: here we observe that -/+ inf values are added in column "A" |
| 780 | + assert_frame_equal(expected, data) |
| 781 | + |
| 782 | + |
587 | 783 | def test_add_to_snapshot_and_remove_from_snapshots_scenarios(basic_store):
588 | 784 | lib: NativeVersionStore = basic_store
589 | 785 | lib.write("s1", 100)