From 90807d0dfb42b4b6f6c01d4d91761747b9ada561 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 Aug 2025 08:16:48 +0000 Subject: [PATCH 1/7] Initial plan From 8ecc0d7864198853824618b93a4f3171cd6a8cf8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 Aug 2025 08:24:39 +0000 Subject: [PATCH 2/7] Updated test_head.py and test_tail.py to use any_output_format fixture Co-authored-by: IvoDD <5950454+IvoDD@users.noreply.github.com> --- .../unit/arcticdb/version_store/test_head.py | 58 ++++++++++++------- .../unit/arcticdb/version_store/test_tail.py | 56 +++++++++++------- 2 files changed, 74 insertions(+), 40 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_head.py b/python/tests/unit/arcticdb/version_store/test_head.py index d207e3bce7..c7ef3e6cfd 100644 --- a/python/tests/unit/arcticdb/version_store/test_head.py +++ b/python/tests/unit/arcticdb/version_store/test_head.py @@ -12,6 +12,7 @@ import pytest from arcticdb_ext.exceptions import InternalException +from arcticdb.util.test import assert_frame_equal_with_arrow pytestmark = pytest.mark.pipeline @@ -19,81 +20,97 @@ def generic_head_test(version_store, symbol, df, num_rows): version_store.write(symbol, df) - assert np.array_equal(df.head(num_rows), version_store.head(symbol, num_rows).data) + result = version_store.head(symbol, num_rows).data + expected = df.head(num_rows) + assert_frame_equal_with_arrow(expected, result) -def test_head_large_segment(lmdb_version_store): +def test_head_large_segment(lmdb_version_store, any_output_format): + lmdb_version_store.set_output_format(any_output_format) df = DataFrame({"x": np.arange(100_000, dtype=np.int64)}) generic_head_test(lmdb_version_store, "test_head_large_segment", df, 50_000) -def test_head_zero_num_rows(lmdb_version_store, one_col_df): +def test_head_zero_num_rows(lmdb_version_store, one_col_df, any_output_format): + lmdb_version_store.set_output_format(any_output_format) generic_head_test(lmdb_version_store, "test_head_zero_num_rows", one_col_df(), 0) -def test_head_one_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_one_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_one_num_rows", one_col_df(), 1) -def test_head_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # lmdb_version_store_tiny_segment has segment_row_size set to 2 + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_segment_boundary_num_rows", one_col_df(), 2) -def test_head_multiple_segments(lmdb_version_store_tiny_segment, one_col_df): +def test_head_multiple_segments(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # lmdb_version_store_tiny_segment has segment_row_size set to 2 + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_multiple_segments", one_col_df(), 7) -def test_head_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + 
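    # Negative n follows pandas semantics: df.head(-7) keeps every row except the
    # last 7 (a 10-row frame yields rows 0-2), so df.head(num_rows) in the generic
    # test above doubles as the expected result without special-casing.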
lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_negative_num_rows", one_col_df(), -7) -def test_head_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_num_rows_greater_than_table_length", one_col_df(), 10) -def test_head_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test( lmdb_version_store_tiny_segment, "test_head_negative_num_rows_equals_table_length", one_col_df(), -10 ) -def test_head_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_num_rows_greater_than_table_length", one_col_df(), 11) -def test_head_negative_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_negative_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_head_test( lmdb_version_store_tiny_segment, "test_head_negative_num_rows_greater_than_table_length", one_col_df(), -11 ) -def test_head_default_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_default_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) symbol = "test_head_default_num_rows" lmdb_version_store_tiny_segment.write(symbol, one_col_df()) num_rows = signature(lmdb_version_store_tiny_segment.head).parameters["n"].default - assert np.array_equal(one_col_df().head(num_rows), lmdb_version_store_tiny_segment.head(symbol).data) + result = lmdb_version_store_tiny_segment.head(symbol).data + expected = one_col_df().head(num_rows) + assert_frame_equal_with_arrow(expected, result) -def test_head_with_column_filter(lmdb_version_store_tiny_segment, three_col_df): +def test_head_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) symbol = "test_head_with_column_filter" lmdb_version_store_tiny_segment.write(symbol, three_col_df()) # lmdb_version_store_tiny_segment has column_group_size set to 2 num_rows = 5 # three_col_df generates a dataframe with 10 rows and 3 columns labelled x, y, and z columns = ["x", "z"] - assert np.array_equal( - three_col_df().filter(items=columns).head(num_rows), - lmdb_version_store_tiny_segment.head(symbol, num_rows, columns=columns).data, - ) + result = lmdb_version_store_tiny_segment.head(symbol, num_rows, columns=columns).data + expected = three_col_df().filter(items=columns).head(num_rows) + 
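    # A minimal sketch of what the Arrow-aware assertion is assumed to do (the
    # real arcticdb.util.test helper may differ):
    #     if isinstance(result, pyarrow.Table):
    #         result = result.to_pandas()
    #     assert_frame_equal(expected, result)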
assert_frame_equal_with_arrow(expected, result) -def test_head_pickled_symbol(lmdb_version_store): +def test_head_pickled_symbol(lmdb_version_store, any_output_format): + lmdb_version_store.set_output_format(any_output_format) symbol = "test_head_pickled_symbol" lmdb_version_store.write(symbol, np.arange(100).tolist()) assert lmdb_version_store.is_symbol_pickled(symbol) @@ -102,8 +119,9 @@ def test_head_pickled_symbol(lmdb_version_store): @pytest.mark.parametrize("n", range(6)) -def test_dynamic_schema_head(lmdb_version_store_dynamic_schema, n): +def test_dynamic_schema_head(lmdb_version_store_dynamic_schema, n, any_output_format): lib = lmdb_version_store_dynamic_schema + lib.set_output_format(any_output_format) lib.write("sym", DataFrame({"a": [1, 2]}, index=[0, 1])) lib.append("sym", DataFrame({"b": [5, 6]}, index=[2, 3])) result = lib.head("sym", n=n).data diff --git a/python/tests/unit/arcticdb/version_store/test_tail.py b/python/tests/unit/arcticdb/version_store/test_tail.py index e13b160b98..61e3b46b2a 100644 --- a/python/tests/unit/arcticdb/version_store/test_tail.py +++ b/python/tests/unit/arcticdb/version_store/test_tail.py @@ -12,6 +12,7 @@ import pytest from arcticdb_ext.exceptions import InternalException +from arcticdb.util.test import assert_frame_equal_with_arrow pytestmark = pytest.mark.pipeline @@ -21,81 +22,95 @@ def generic_tail_test(version_store, symbol, df, num_rows): version_store.write(symbol, df) expected = df.tail(num_rows) actual = version_store.tail(symbol, num_rows).data - assert np.array_equal(expected, actual) + assert_frame_equal_with_arrow(expected, actual) -def test_tail_large_segment(lmdb_version_store): +def test_tail_large_segment(lmdb_version_store, any_output_format): + lmdb_version_store.set_output_format(any_output_format) df = DataFrame({"x": np.arange(100_000, dtype=np.int64)}) generic_tail_test(lmdb_version_store, "test_tail_large_segment", df, 50_000) -def test_tail_zero_num_rows(lmdb_version_store, one_col_df): +def test_tail_zero_num_rows(lmdb_version_store, one_col_df, any_output_format): + lmdb_version_store.set_output_format(any_output_format) generic_tail_test(lmdb_version_store, "test_tail_zero_num_rows", one_col_df(), 0) -def test_tail_one_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_one_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_one_num_rows", one_col_df(), 1) -def test_tail_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # lmdb_version_store_tiny_segment has segment_row_size set to 2 + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_segment_boundary_num_rows", one_col_df(), 2) -def test_tail_multiple_segments(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_multiple_segments(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # lmdb_version_store_tiny_segment has segment_row_size set to 2 + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_multiple_segments", one_col_df(), 7) -def test_tail_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df, 
any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_negative_num_rows", one_col_df(), -7) -def test_tail_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_num_rows_greater_than_table_length", one_col_df(), 10) -def test_tail_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test( lmdb_version_store_tiny_segment, "test_tail_negative_num_rows_equals_table_length", one_col_df(), -10 ) -def test_tail_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_num_rows_greater_than_table_length", one_col_df(), 11) -def test_tail_negative_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_negative_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): # one_col_df generates a dataframe with 10 rows + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test( lmdb_version_store_tiny_segment, "test_tail_negative_num_rows_greater_than_table_length", one_col_df(), -11 ) -def test_tail_default_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_tail_default_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) symbol = "test_tail_default_num_rows" lmdb_version_store_tiny_segment.write(symbol, one_col_df()) num_rows = signature(lmdb_version_store_tiny_segment.tail).parameters["n"].default - assert np.array_equal(one_col_df().tail(num_rows), lmdb_version_store_tiny_segment.tail(symbol).data) + expected = one_col_df().tail(num_rows) + actual = lmdb_version_store_tiny_segment.tail(symbol).data + assert_frame_equal_with_arrow(expected, actual) -def test_tail_with_column_filter(lmdb_version_store_tiny_segment, three_col_df): +def test_tail_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) symbol = "test_tail_with_column_filter" lmdb_version_store_tiny_segment.write(symbol, three_col_df()) # lmdb_version_store_tiny_segment has column_group_size set to 2 num_rows = 5 # three_col_df generates a dataframe with 10 rows and 3 columns labelled x, y, and z columns = ["x", "z"] - assert np.array_equal( - three_col_df().filter(items=columns).tail(num_rows), - lmdb_version_store_tiny_segment.tail(symbol, num_rows, columns=columns).data, - ) + expected = three_col_df().filter(items=columns).tail(num_rows) + actual = lmdb_version_store_tiny_segment.tail(symbol, num_rows, columns=columns).data + 
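    # any_output_format is assumed to be a parametrised fixture, roughly:
    #     @pytest.fixture(params=["pandas", "arrow"])  # placeholder values
    #     def any_output_format(request):
    #         return request.param
    # so each test body runs once per output format the store supports.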
assert_frame_equal_with_arrow(expected, actual) -def test_tail_multiple_segments_odd_total_rows(lmdb_version_store_tiny_segment): +def test_tail_multiple_segments_odd_total_rows(lmdb_version_store_tiny_segment, any_output_format): + lmdb_version_store_tiny_segment.set_output_format(any_output_format) generic_tail_test( lmdb_version_store_tiny_segment, "test_tail_multiple_segments_odd_total_rows", @@ -104,7 +119,8 @@ def test_tail_multiple_segments_odd_total_rows(lmdb_version_store_tiny_segment): ) -def test_tail_pickled_symbol(lmdb_version_store): +def test_tail_pickled_symbol(lmdb_version_store, any_output_format): + lmdb_version_store.set_output_format(any_output_format) symbol = "test_tail_pickled_symbol" lmdb_version_store.write(symbol, np.arange(100).tolist()) assert lmdb_version_store.is_symbol_pickled(symbol) From 86eac3285718e57b39ea58aca877f29975472b3b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 Aug 2025 08:31:12 +0000 Subject: [PATCH 3/7] Updated test_aggregation.py and test_projection.py to use any_output_format fixture Co-authored-by: IvoDD <5950454+IvoDD@users.noreply.github.com> --- .../version_store/test_aggregation.py | 165 +++++++++++------- .../arcticdb/version_store/test_projection.py | 27 +-- 2 files changed, 122 insertions(+), 70 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_aggregation.py b/python/tests/unit/arcticdb/version_store/test_aggregation.py index 3bdfe466df..ccc6ef9f2b 100644 --- a/python/tests/unit/arcticdb/version_store/test_aggregation.py +++ b/python/tests/unit/arcticdb/version_store/test_aggregation.py @@ -14,6 +14,7 @@ from arcticdb_ext.exceptions import InternalException, SchemaException from arcticdb.util.test import ( assert_frame_equal, + assert_frame_equal_with_arrow, generic_aggregation_test, make_dynamic, common_sum_aggregation_dtype, @@ -23,18 +24,31 @@ pytestmark = pytest.mark.pipeline -def test_group_on_float_column_with_nans(lmdb_version_store_v1): +def aggregation_test_with_any_output_format(lib, symbol, df, grouping_column, aggs_dict): + """Helper function for aggregation tests that works with any output format.""" + expected = df.groupby(grouping_column).agg(aggs_dict) + expected = expected.reindex(columns=sorted(expected.columns)) + q = QueryBuilder().groupby(grouping_column).agg(aggs_dict) + received = lib.read(symbol, query_builder=q).data + received = received.reindex(columns=sorted(received.columns)) + received.sort_index(inplace=True) + assert_frame_equal_with_arrow(expected, received, check_dtype=False) + + +def test_group_on_float_column_with_nans(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_group_on_float_column_with_nans" df = pd.DataFrame({"grouping_column": [1.0, 2.0, np.nan, 1.0, 2.0, 2.0], "agg_column": [1, 2, 3, 4, 5, 6]}) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"agg_column": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"agg_column": "sum"}) # TODO: Add first and last once un-feature flagged @pytest.mark.parametrize("aggregator", ("sum", "min", "max", "mean", "count")) -def test_aggregate_float_columns_with_nans(lmdb_version_store_v1, aggregator): +def test_aggregate_float_columns_with_nans(lmdb_version_store_v1, aggregator, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = 
"test_aggregate_float_columns_with_nans" df = pd.DataFrame( { @@ -43,11 +57,12 @@ def test_aggregate_float_columns_with_nans(lmdb_version_store_v1, aggregator): } ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"agg_column": aggregator}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"agg_column": aggregator}) -def test_count_aggregation(lmdb_version_store_v1): +def test_count_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_count_aggregation" df = DataFrame( { @@ -57,7 +72,7 @@ def test_count_aggregation(lmdb_version_store_v1): index=np.arange(6), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_count": "count"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_count": "count"}) @pytest.mark.skip(reason="Feature flagged off until working with string columns and dynamic schema") @@ -101,7 +116,7 @@ def test_last_aggregation(lmdb_version_store_v1): index=np.arange(9), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_last": "last"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_last": "last"}) @pytest.mark.skip(reason="Feature flagged off until working with string columns and dynamic schema") @@ -117,18 +132,20 @@ def test_last_agg_with_append(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, pd.concat([df_0, df_1, df_2]), "grouping_column", {"to_last": "last"}) -def test_sum_aggregation(lmdb_version_store_v1): +def test_sum_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_sum_aggregation" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_sum": [1, 1, 2, 2, 2]}, index=np.arange(5), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_sum_aggregation_bool(lmdb_version_store_v1): +def test_sum_aggregation_bool(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_sum_aggregation" df = DataFrame( { @@ -138,22 +155,24 @@ def test_sum_aggregation_bool(lmdb_version_store_v1): index=np.arange(9), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_mean_aggregation(lmdb_version_store_v1): +def test_mean_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_mean_aggregation" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_mean": [1, 1, 2, 2, 2]}, index=np.arange(5), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) -def test_mean_aggregation_float(lmdb_version_store_v1): +def test_mean_aggregation_float(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_mean_aggregation_float" df = 
DataFrame( { @@ -163,10 +182,11 @@ def test_mean_aggregation_float(lmdb_version_store_v1): index=np.arange(5), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) -def test_mean_aggregation_timestamp(lmdb_version_store_v1): +def test_mean_aggregation_timestamp(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_mean_aggregation_float" df = DataFrame( { @@ -191,10 +211,11 @@ def test_mean_aggregation_timestamp(lmdb_version_store_v1): index=np.arange(14), ) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) -def test_named_agg(lmdb_version_store_tiny_segment): +def test_named_agg(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) symbol = "test_named_agg" gen = np.random.default_rng() df = DataFrame( @@ -223,16 +244,18 @@ def test_named_agg(lmdb_version_store_tiny_segment): assert_frame_equal(expected, received, check_dtype=False) -def test_max_minus_one(lmdb_version_store_v1): +def test_max_minus_one(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_max_minus_one" df = pd.DataFrame({"grouping_column": ["thing"], "to_max": [-1]}) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_max": "max"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_max": "max"}) -def test_group_empty_dataframe(lmdb_version_store_v1): +def test_group_empty_dataframe(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_group_empty_dataframe" df = DataFrame({"grouping_column": [], "to_mean": []}) lib.write(symbol, df) @@ -241,8 +264,9 @@ def test_group_empty_dataframe(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_group_pickled_symbol(lmdb_version_store_v1): +def test_group_pickled_symbol(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_group_pickled_symbol" lib.write(symbol, np.arange(100).tolist()) assert lib.is_symbol_pickled(symbol) @@ -251,8 +275,9 @@ def test_group_pickled_symbol(lmdb_version_store_v1): _ = lib.read(symbol, query_builder=q) -def test_group_column_not_present(lmdb_version_store_v1): +def test_group_column_not_present(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_group_column_not_present" df = DataFrame({"a": np.arange(2)}, index=np.arange(2)) lib.write(symbol, df) @@ -261,8 +286,9 @@ def test_group_column_not_present(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_group_column_splitting(lmdb_version_store_tiny_segment): +def test_group_column_splitting(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) symbol = "test_group_column_splitting" df = DataFrame( { @@ -274,7 +300,7 @@ def test_group_column_splitting(lmdb_version_store_tiny_segment): } ) lib.write(symbol, df) - generic_aggregation_test( + 
aggregation_test_with_any_output_format( lib, symbol, df, @@ -283,8 +309,9 @@ def test_group_column_splitting(lmdb_version_store_tiny_segment): ) -def test_group_column_splitting_strings(lmdb_version_store_tiny_segment): +def test_group_column_splitting_strings(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) symbol = "test_group_column_splitting" df = DataFrame( { @@ -296,7 +323,7 @@ def test_group_column_splitting_strings(lmdb_version_store_tiny_segment): } ) lib.write(symbol, df) - generic_aggregation_test( + aggregation_test_with_any_output_format( lib, symbol, df, @@ -305,8 +332,9 @@ def test_group_column_splitting_strings(lmdb_version_store_tiny_segment): ) -def test_aggregation_with_nones_and_nans_in_string_grouping_column(version_store_factory): +def test_aggregation_with_nones_and_nans_in_string_grouping_column(version_store_factory, any_output_format): lib = version_store_factory(column_group_size=2, segment_row_size=2, dynamic_strings=True) + lib.set_output_format(any_output_format) symbol = "test_aggregation_with_nones_and_nans_in_string_grouping_column" # Structured so that the row-slices of the grouping column contain: # 1 - All strings @@ -336,11 +364,12 @@ def test_aggregation_with_nones_and_nans_in_string_grouping_column(version_store index=np.arange(12), ) lib.write(symbol, df, dynamic_strings=True) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_doctring_example_query_builder_groupby_max(lmdb_version_store_v1): +def test_doctring_example_query_builder_groupby_max(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) df = DataFrame({"grouping_column": ["group_1", "group_1", "group_1"], "to_max": [1, 5, 4]}, index=np.arange(3)) q = QueryBuilder() q = q.groupby("grouping_column").agg({"to_max": "max"}) @@ -352,8 +381,9 @@ def test_doctring_example_query_builder_groupby_max(lmdb_version_store_v1): assert_frame_equal(res.data, df) -def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store_v1): +def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1"], "to_mean": [1.1, 1.4, 2.5], "to_max": [1.1, 1.4, 2.5]}, index=np.arange(3), @@ -375,8 +405,9 @@ def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store ################################## -def test_count_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_count_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_count_aggregation_dynamic" df = DataFrame( { @@ -388,12 +419,13 @@ def test_count_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): df, slices = make_dynamic(df) for df_slice in slices: lib.append(symbol, df_slice) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_count": "count"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_count": "count"}) @pytest.mark.xfail(reason="Not supported yet") -def test_first_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def 
test_first_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_first_aggregation_dynamic" df = DataFrame( { @@ -405,12 +437,13 @@ def test_first_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): df, slices = make_dynamic(df) for df_slice in slices: lib.append(symbol, df_slice) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_first": "first"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_first": "first"}) @pytest.mark.xfail(reason="Not supported yet") -def test_last_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_last_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_last_aggregation_dynamic" df = DataFrame( { @@ -422,11 +455,12 @@ def test_last_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): df, slices = make_dynamic(df) for df_slice in slices: lib.append(symbol, df_slice) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_last": "last"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_last": "last"}) -def test_sum_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_sum_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_sum_aggregation_dynamic" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_sum": [1, 1, 2, 2, 2]}, @@ -435,18 +469,20 @@ def test_sum_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): df, slices = make_dynamic(df) for df_slice in slices: lib.append(symbol, df_slice, write_if_missing=True) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_sum_aggregation_dynamic_bool_missing_aggregated_column(lmdb_version_store_dynamic_schema_v1): +def test_sum_aggregation_dynamic_bool_missing_aggregated_column(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_sum_aggregation_dynamic" df = DataFrame({"grouping_column": ["group_1", "group_2"], "to_sum": [True, False]}, index=np.arange(2),) lib.write(symbol, df) lib.append(symbol, pd.DataFrame({"grouping_column": ["group_1", "group_2"]}, index=np.arange(2))) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_sum_aggregation_with_range_index_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_sum_aggregation_with_range_index_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_sum_aggregation_with_range_index_dynamic" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_sum": [1, 1, 2, 2, 2]} @@ -454,11 +490,12 @@ def test_sum_aggregation_with_range_index_dynamic(lmdb_version_store_dynamic_sch df, slices = make_dynamic(df) for df_slice in slices: lib.append(symbol, df_slice) - generic_aggregation_test(lib, 
symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_group_empty_dataframe_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_group_empty_dataframe_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_group_empty_dataframe_dynamic" df = DataFrame({"grouping_column": [], "to_mean": []}) lib.write(symbol, df) @@ -467,8 +504,9 @@ def test_group_empty_dataframe_dynamic(lmdb_version_store_dynamic_schema_v1): lib.read(symbol, query_builder=q) -def test_group_pickled_symbol_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_group_pickled_symbol_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_group_pickled_symbol_dynamic" lib.write(symbol, np.arange(100).tolist()) assert lib.is_symbol_pickled(symbol) @@ -477,8 +515,9 @@ def test_group_pickled_symbol_dynamic(lmdb_version_store_dynamic_schema_v1): lib.read(symbol, query_builder=q) -def test_group_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_group_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_group_column_not_present_dynamic" df = DataFrame({"a": np.arange(2)}, index=np.arange(2)) lib.write(symbol, df) @@ -488,46 +527,51 @@ def test_group_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1): @pytest.mark.parametrize("agg", ("max", "min", "mean", "sum")) -def test_segment_without_aggregation_column(lmdb_version_store_dynamic_schema_v1, agg): +def test_segment_without_aggregation_column(lmdb_version_store_dynamic_schema_v1, agg, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_segment_without_aggregation_column" write_df = pd.DataFrame({"grouping_column": ["group_0"], "aggregation_column": [10330.0]}) lib.write(symbol, write_df) append_df = pd.DataFrame({"grouping_column": ["group_1"]}) lib.append(symbol, append_df) - generic_aggregation_test(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"aggregation_column": agg}) + aggregation_test_with_any_output_format(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"aggregation_column": agg}) -def test_minimal_repro_type_change(lmdb_version_store_dynamic_schema_v1): +def test_minimal_repro_type_change(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_minimal_repro_type_change" write_df = pd.DataFrame({"grouping_column": ["group_1"], "to_sum": [np.uint8(1)]}) lib.write(symbol, write_df) append_df = pd.DataFrame({"grouping_column": ["group_1"], "to_sum": [1.5]}) lib.append(symbol, append_df) - generic_aggregation_test(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_sum": "sum"}) -def test_minimal_repro_type_change_max(lmdb_version_store_dynamic_schema_v1): +def test_minimal_repro_type_change_max(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + 
lib.set_output_format(any_output_format) symbol = "test_minimal_repro_type_change_max" write_df = pd.DataFrame({"grouping_column": ["group_1"], "to_max": [np.uint8(1)]}) lib.write(symbol, write_df) append_df = pd.DataFrame({"grouping_column": ["group_1"], "to_max": [0.5]}) lib.append(symbol, append_df) - generic_aggregation_test(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_max": "max"}) + aggregation_test_with_any_output_format(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_max": "max"}) -def test_minimal_repro_type_sum_similar_string_group_values(lmdb_version_store_dynamic_schema_v1): +def test_minimal_repro_type_sum_similar_string_group_values(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_minimal_repro_type_sum_similar_string_group_values" df = pd.DataFrame({"grouping_column": ["0", "000"], "to_sum": [1.0, 1.0]}) lib.write(symbol, df) - generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_dynamic_schema_v1): +def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_aggregation_grouping_column_missing_from_row_group" write_df = DataFrame( {"to_sum": [1, 2], "grouping_column": ["group_1", "group_2"]}, @@ -539,13 +583,13 @@ def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_d index=np.arange(2, 4), ) lib.append(symbol, append_df) - generic_aggregation_test(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_sum": "sum"}) + aggregation_test_with_any_output_format(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_sum": "sum"}) @pytest.mark.parametrize("first_dtype,", [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64, bool]) @pytest.mark.parametrize("second_dtype", [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64, bool]) @pytest.mark.parametrize("first_group", ["0", "1"]) @pytest.mark.parametrize("second_group", ["0", "1"]) -def test_sum_aggregation_type(lmdb_version_store_dynamic_schema_v1, first_dtype, second_dtype, first_group, second_group): +def test_sum_aggregation_type(lmdb_version_store_dynamic_schema_v1, first_dtype, second_dtype, first_group, second_group, any_output_format): """ Sum aggregation promotes to the largest type of the respective category. int -> int64, uint -> uint64, float -> float64 Dynamic schema allows mixing int and uint. 
In the case of sum aggregation, this will require mixing uint64 and int64 @@ -553,6 +597,7 @@ def test_sum_aggregation_type(lmdb_version_store_dynamic_schema_v1, first_dtype, test we test all configurations of dtypes and grouping options (same group vs different group) """ lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) df1 = pd.DataFrame({"grouping_column": [first_group], "to_sum": np.array([1], first_dtype)}) df2 = pd.DataFrame({"grouping_column": [second_group], "to_sum": np.array([1], second_dtype)}) lib.write("sym", df1) @@ -576,13 +621,14 @@ def test_sum_aggregation_type(lmdb_version_store_dynamic_schema_v1, first_dtype, @pytest.mark.parametrize("extremum", ["min", "max"]) @pytest.mark.parametrize("dtype, default_value", [(np.int32, 0), (np.float32, np.nan), (bool, False)]) -def test_extremum_aggregation_with_missing_aggregation_column(lmdb_version_store_dynamic_schema_v1, extremum, dtype, default_value): +def test_extremum_aggregation_with_missing_aggregation_column(lmdb_version_store_dynamic_schema_v1, extremum, dtype, default_value, any_output_format): """ Test that a sparse column will be backfilled with the correct values. df1 will be skipped because there is no grouping column. df2 will form the first row, which is sparse because the aggregation column is missing. The frame appended after it will form the second row, which will be dense and not backfilled. """ lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) sym = "sym" df1 = pd.DataFrame({"agg_column": np.array([0, 0], dtype)}) df2 = pd.DataFrame({"grouping_column": ["a"]}) @@ -598,8 +644,9 @@ def test_extremum_aggregation_with_missing_aggregation_column(lmdb_version_store expected = expected.sort_index() assert_frame_equal(data, expected) -def test_mean_timestamp_aggregation_with_missing_aggregation_column(lmdb_version_store_dynamic_schema_v1): +def test_mean_timestamp_aggregation_with_missing_aggregation_column(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) sym = "sym" df1 = pd.DataFrame({"agg": [pd.Timestamp(1)], "grouping": [0]}) df2 = pd.DataFrame({"grouping": [0, 1, 2]}) diff --git a/python/tests/unit/arcticdb/version_store/test_projection.py b/python/tests/unit/arcticdb/version_store/test_projection.py index 4af8a82138..80c4cebc10 100644 --- a/python/tests/unit/arcticdb/version_store/test_projection.py +++ b/python/tests/unit/arcticdb/version_store/test_projection.py @@ -12,7 +12,7 @@ from arcticdb_ext.exceptions import InternalException, UserInputException from arcticdb.exceptions import ArcticNativeException from arcticdb.version_store.processing import QueryBuilder -from arcticdb.util.test import assert_frame_equal, make_dynamic, regularize_dataframe +from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow, make_dynamic, regularize_dataframe pytestmark = pytest.mark.pipeline @@ -62,15 +62,16 @@ def test_project_string_unary_arithmetic(lmdb_version_store_v1): @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=3)]) @pytest.mark.parametrize("value", [5, "hello"]) -def test_project_fixed_value(lmdb_version_store_tiny_segment, index, value): +def test_project_fixed_value(lmdb_version_store_tiny_segment, index, value, any_output_format): lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) sym = "test_project_fixed_value" df = pd.DataFrame({"col1": [0, 1, 2], "col2": [3, 4, 5], "col3": [6, 7,
8]}, index=index) lib.write(sym, df) df["new_col"] = value q = QueryBuilder().apply("new_col", value) received = lib.read(sym, query_builder=q).data - assert_frame_equal(df, received, check_dtype=False) + assert_frame_equal_with_arrow(df, received, check_dtype=False) def test_project_value_set(): @@ -78,8 +79,9 @@ def test_project_value_set(): QueryBuilder().apply("new_col", [0, 1, 2]) -def test_docstring_example_query_builder_apply(lmdb_version_store_v1): +def test_docstring_example_query_builder_apply(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) df = pd.DataFrame( { "VWAP": np.arange(0, 10, dtype=np.float64), @@ -96,7 +98,7 @@ def test_docstring_example_query_builder_apply(lmdb_version_store_v1): data = lib.read("expression", query_builder=q).data df["ADJUSTED"] = df["ASK"] * df["VOL_ACC"] + 7 - assert_frame_equal(df.astype({"ADJUSTED": "int64"}), data) + assert_frame_equal_with_arrow(df.astype({"ADJUSTED": "int64"}), data) ################################## @@ -104,8 +106,9 @@ def test_docstring_example_query_builder_apply(lmdb_version_store_v1): ################################## -def test_project_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_project_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_project_dynamic" df = pd.DataFrame( @@ -128,11 +131,12 @@ def test_project_dynamic(lmdb_version_store_dynamic_schema_v1): expected["ADJUSTED"] = expected["ASK"] * expected["ACVOL"] + 7 received = regularize_dataframe(vit.data) expected = regularize_dataframe(expected) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_schema): +def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_schema, any_output_format): lib = lmdb_version_store_dynamic_schema + lib.set_output_format(any_output_format) symbol = "test_project_column_types_changing_and_missing" # Floats expected = pd.DataFrame({"col_to_project": [0.5, 1.5], "data_col": [0, 1]}, index=np.arange(0, 2)) @@ -156,13 +160,14 @@ def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_sc q = QueryBuilder() q = q.apply("projected_col", q["col_to_project"] * 2) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("index", [None, "timeseries"]) @pytest.mark.parametrize("value", [5, "hello"]) -def test_project_fixed_value_dynamic(lmdb_version_store_dynamic_schema_v1, index, value): +def test_project_fixed_value_dynamic(lmdb_version_store_dynamic_schema_v1, index, value, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) sym = "test_project_fixed_value_dynamic" df0 = pd.DataFrame({"col1": [0, 0.1, 0.2], "col2": [0.3, 0.4, 0.5]}, index=pd.date_range("2025-01-01", periods=3) if index == "timeseries" else None) df1 = pd.DataFrame({"col2": [0.6, 0.7, 0.8]}, index=pd.date_range("2025-01-04", periods=3) if index == "timeseries" else None) @@ -174,4 +179,4 @@ def test_project_fixed_value_dynamic(lmdb_version_store_dynamic_schema_v1, index expected.index = pd.RangeIndex(6) q = QueryBuilder().apply("new_col", value) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received, check_dtype=False) + 
assert_frame_equal_with_arrow(expected, received, check_dtype=False) From 45d305ac0fc1f4a5762461387f2ed6d98ef5fe59 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 Aug 2025 08:34:49 +0000 Subject: [PATCH 4/7] Updated test_filtering.py and test_row_range.py to use any_output_format fixture Co-authored-by: IvoDD <5950454+IvoDD@users.noreply.github.com> --- .../arcticdb/version_store/test_filtering.py | 133 ++++++++++++------ .../arcticdb/version_store/test_row_range.py | 54 ++++--- 2 files changed, 119 insertions(+), 68 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_filtering.py b/python/tests/unit/arcticdb/version_store/test_filtering.py index 0fb5118919..806f4a0288 100644 --- a/python/tests/unit/arcticdb/version_store/test_filtering.py +++ b/python/tests/unit/arcticdb/version_store/test_filtering.py @@ -20,6 +20,7 @@ from arcticdb.version_store.processing import QueryBuilder from arcticdb.util.test import ( assert_frame_equal, + assert_frame_equal_with_arrow, config_context, get_wide_dataframe, make_dynamic, @@ -37,6 +38,26 @@ pytestmark = pytest.mark.pipeline +def filter_test_with_any_output_format(lib, symbol, arctic_query, expected): + """Helper function for filter tests that works with any output format.""" + received = lib.read(symbol, query_builder=arctic_query).data + assert_frame_equal_with_arrow(expected, received) + + +def filter_test_strings_with_any_output_format(lib, base_symbol, arctic_query, expected): + """Helper function for string filter tests that works with any output format.""" + for symbol in [f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", f"{base_symbol}_{FIXED_STRINGS_SUFFIX}"]: + arctic_query.optimise_for_speed() + filter_test_with_any_output_format(lib, symbol, arctic_query, expected) + arctic_query.optimise_for_memory() + filter_test_with_any_output_format(lib, symbol, arctic_query, expected) + + +def filter_test_nans_with_any_output_format(lib, symbol, arctic_query, expected): + """Helper function for NaN filter tests that works with any output format.""" + filter_test_with_any_output_format(lib, symbol, arctic_query, expected) + + def test_filter_column_not_present(lmdb_version_store_v1): lib = lmdb_version_store_v1 df = pd.DataFrame({"a": np.arange(2)}, index=np.arange(2)) @@ -48,15 +69,16 @@ def test_filter_column_not_present(lmdb_version_store_v1): _ = lib.read(symbol, query_builder=q) -def test_filter_column_attribute_syntax(lmdb_version_store_v1): +def test_filter_column_attribute_syntax(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_column_attribute_syntax" df = pd.DataFrame({"a": [np.uint8(1), np.uint8(0)]}) lib.write(symbol, df) q = QueryBuilder() q = q[q.a < np.uint8(1)] expected = df[df["a"] < np.uint8(1)] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) def test_filter_infinite_value(): @@ -97,8 +119,9 @@ def test_filter_explicit_index(lmdb_version_store_v1): assert_frame_equal(df.query(pandas_query), lib.read(symbol, query_builder=q).data) -def test_filter_clashing_values(lmdb_version_store_v1): +def test_filter_clashing_values(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) base_symbol = "test_filter_clashing_values" df = pd.DataFrame({"a": [10, 11, 12], "b": ["11", "12", "13"]}, index=np.arange(3)) 
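    # The same frame is written under two symbols, one per string storage mode;
    # the helper then replays the query against both the dynamic- and
    # fixed-string copies under optimise_for_speed and optimise_for_memory.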
lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) @@ -106,7 +129,7 @@ def test_filter_clashing_values(lmdb_version_store_v1): q = QueryBuilder() q = q[(q.a == 11) | (q.b == "11")] expected = df[(df["a"] == 11) | (df["b"] == "11")] - generic_filter_test_strings(lib, base_symbol, q, expected) + filter_test_strings_with_any_output_format(lib, base_symbol, q, expected) def test_filter_bool_nonbool_comparison(lmdb_version_store_v1): @@ -147,41 +170,45 @@ def test_filter_bool_nonbool_comparison(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_filter_bool_column(lmdb_version_store_v1): +def test_filter_bool_column(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_bool_column" df = pd.DataFrame({"a": [True, False]}, index=np.arange(2)) lib.write(symbol, df) q = QueryBuilder() q = q[q["a"]] expected = df[df["a"]] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_bool_column_not(lmdb_version_store_v1): +def test_filter_bool_column_not(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_bool_column_not" df = pd.DataFrame({"a": [True, False]}, index=np.arange(2)) lib.write(symbol, df) q = QueryBuilder() q = q[~q["a"]] expected = df[~df["a"]] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_bool_column_binary_boolean(lmdb_version_store_v1): +def test_filter_bool_column_binary_boolean(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_bool_column_binary_boolean" df = pd.DataFrame({"a": [True, True, False, False], "b": [True, False, True, False]}, index=np.arange(4)) lib.write(symbol, df) q = QueryBuilder() q = q[q["a"] & q["b"]] expected = df[df["a"] & df["b"]] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_bool_column_comparison(lmdb_version_store_v1): +def test_filter_bool_column_comparison(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_bool_column_comparison" df = pd.DataFrame({"a": [True, False]}, index=np.arange(2)) lib.write(symbol, df) @@ -208,11 +235,12 @@ def test_filter_bool_column_comparison(lmdb_version_store_v1): elif comparator == ">=": q = q[q["a"] >= bool_value] expected = df[df["a"] >= bool_value] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_datetime_naive(lmdb_version_store_v1): +def test_filter_datetime_naive(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_datetime_simple" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10)}) lib.write(symbol, df) @@ -221,11 +249,12 @@ def test_filter_datetime_naive(lmdb_version_store_v1): q = QueryBuilder() q = q[q["a"] < ts] expected = df[df["a"] < ts] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_datetime_isin(lmdb_version_store_v1): +def test_filter_datetime_isin(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + 
lib.set_output_format(any_output_format) symbol = "test_filter_datetime_isin" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10)}) lib.write(symbol, df) @@ -234,7 +263,7 @@ def test_filter_datetime_isin(lmdb_version_store_v1): q = QueryBuilder() q = q[q["a"] == [ts]] expected = df[df["a"].isin([ts])] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) def test_filter_datetime_timedelta(lmdb_version_store_v1): @@ -260,8 +289,9 @@ def test_filter_datetime_timedelta(lmdb_version_store_v1): assert True -def test_filter_datetime_timezone_aware(lmdb_version_store_v1): +def test_filter_datetime_timezone_aware(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_datetime_timezone_aware" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10, tz=timezone("Europe/Amsterdam"))}) lib.write(symbol, df) @@ -272,7 +302,7 @@ def test_filter_datetime_timezone_aware(lmdb_version_store_v1): expected = df[df["a"] < ts] # Convert to UTC and strip tzinfo to match behaviour of roundtripping through Arctic expected["a"] = expected["a"].apply(lambda x: x.tz_convert(timezone("utc")).tz_localize(None)) - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) def test_df_query_wrong_type(lmdb_version_store_v1): @@ -362,8 +392,9 @@ def test_filter_datetime_nanoseconds(lmdb_version_store_v1): assert_frame_equal(second_and_third_row_result, df.iloc[[1, 2]].reset_index(drop=True)) -def test_filter_isin_clashing_sets(lmdb_version_store_v1): +def test_filter_isin_clashing_sets(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_isin_clashing_sets" a_unique_val = 100000 b_unique_val = 200000 @@ -377,7 +408,7 @@ def test_filter_isin_clashing_sets(lmdb_version_store_v1): assert str(vals1) == str(vals2) q = q[(q["a"].isin(vals1)) | (q["b"].isin(vals2))] expected = df[(df["a"].isin(vals1)) | (df["b"].isin(vals2))] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) @pytest.mark.parametrize( @@ -441,8 +472,9 @@ def test_filter_numeric_isnotin_hashing_overflow(lmdb_version_store_v1): @pytest.mark.parametrize("op", ("in", "not in")) @pytest.mark.parametrize("signed_type", (np.int8, np.int16, np.int32, np.int64)) @pytest.mark.parametrize("uint64_in", ("df", "vals") if PANDAS_VERSION >= Version("1.2") else ("vals",)) -def test_filter_numeric_membership_mixing_int64_and_uint64(lmdb_version_store_v1, op, signed_type, uint64_in): +def test_filter_numeric_membership_mixing_int64_and_uint64(lmdb_version_store_v1, op, signed_type, uint64_in, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_numeric_membership_mixing_int64_and_uint64" signed = signed_type(-1) if uint64_in == "df": @@ -454,7 +486,7 @@ def test_filter_numeric_membership_mixing_int64_and_uint64(lmdb_version_store_v1 q = QueryBuilder() q = q[q["a"].isin(vals) if op == "in" else q["a"].isnotin(vals)] expected = df[df["a"].isin(vals) if op == "in" else ~df["a"].isin(vals)] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) def test_filter_nones_and_nans_retained_in_string_column(lmdb_version_store_v1): @@ -474,8 +506,9 @@ def test_filter_nones_and_nans_retained_in_string_column(lmdb_version_store_v1): # 
Tests that false matches aren't generated when list members truncate to column values -def test_filter_fixed_width_string_isin_truncation(lmdb_version_store_v1): +def test_filter_fixed_width_string_isin_truncation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_fixed_width_string_isin_truncation" df = pd.DataFrame({"a": ["1"]}, index=np.arange(1)) lib.write(symbol, df, dynamic_strings=False) @@ -483,10 +516,10 @@ def test_filter_fixed_width_string_isin_truncation(lmdb_version_store_v1): q = QueryBuilder() q = q[q["a"].isin(vals)] expected = df[df["a"].isin(vals)] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment): +def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment, any_output_format): # Construct a dataframe and QueryBuilder pair with the following properties: # - original dataframe spanning multiple segments horizontally and vertically (tiny segment == 2x2) # - strings of varying lengths to exercise fixed width strings more completely @@ -495,6 +528,7 @@ def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment): # - at least one segment will need none of the strings in it's pool after filtering # - at least one segment will need some, but not all of the strings in it's pool after filtering lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) base_symbol = "test_filter_stringpool_shrinking_basic" df = pd.DataFrame( { @@ -510,11 +544,12 @@ def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment): q = q[q["a"] != "a1"] q.optimise_for_memory() expected = df[df["a"] != "a1"] - generic_filter_test_strings(lib, base_symbol, q, expected) + filter_test_strings_with_any_output_format(lib, base_symbol, q, expected) -def test_filter_stringpool_shrinking_block_alignment(lmdb_version_store_v1): +def test_filter_stringpool_shrinking_block_alignment(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) base_symbol = "test_filter_stringpool_shrinking_block_alignment" # Create a dataframe with more than one block (3968 bytes) worth of strings for the stringpool string_length = 10 @@ -527,7 +562,7 @@ def test_filter_stringpool_shrinking_block_alignment(lmdb_version_store_v1): string_to_find = data[3] q = q[q["a"] == string_to_find] expected = df[df["a"] == string_to_find] - generic_filter_test_strings(lib, base_symbol, q, expected) + filter_test_strings_with_any_output_format(lib, base_symbol, q, expected) def test_filter_explicit_type_promotion(lmdb_version_store_v1): @@ -642,8 +677,9 @@ def test_filter_column_slicing_different_segments(lmdb_version_store_tiny_segmen assert np.array_equal(expected, received) -def test_filter_with_multi_index(lmdb_version_store_v1): +def test_filter_with_multi_index(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_with_multi_index" dt1 = datetime(2019, 4, 8, 10, 5, 2, 1) dt2 = datetime(2019, 4, 9, 10, 5, 2, 1) @@ -656,11 +692,12 @@ def test_filter_with_multi_index(lmdb_version_store_v1): q = QueryBuilder() q = q[(q["a"] == 11) | (q["a"] == 13)] expected = df[(df["a"] == 11) | (df["a"] == 13)] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def 
test_filter_on_multi_index(lmdb_version_store_v1): +def test_filter_on_multi_index(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_on_multi_index" dt1 = datetime(2019, 4, 8, 10, 5, 2, 1) dt2 = datetime(2019, 4, 9, 10, 5, 2, 1) @@ -673,11 +710,12 @@ def test_filter_on_multi_index(lmdb_version_store_v1): q = QueryBuilder() q = q[q["level"] == 1] expected = df.query("level == 1") - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) -def test_filter_complex_expression(lmdb_version_store_tiny_segment): +def test_filter_complex_expression(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) symbol = "test_filter_complex_expression" df = pd.DataFrame( { @@ -691,7 +729,7 @@ def test_filter_complex_expression(lmdb_version_store_tiny_segment): q = QueryBuilder() q = q[(((q["a"] * q["b"]) / 5) < (0.7 * q["c"])) & (q["b"] != 12)] expected = df[(((df["a"] * df["b"]) / 5) < (0.7 * df["c"])) & (df["b"] != 12)] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) def test_filter_string_backslash(lmdb_version_store_v1): @@ -774,8 +812,9 @@ def test_filter_string_greater_than_equal(lmdb_version_store_v1): lib.read(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", query_builder=q).data -def test_filter_string_nans_col_val(lmdb_version_store_v1): +def test_filter_string_nans_col_val(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_string_nans_col_val" df = pd.DataFrame({"a": ["row1", "row2", None, np.nan, math.nan]}, index=np.arange(5)) lib.write(symbol, df, dynamic_strings=True) @@ -783,36 +822,37 @@ def test_filter_string_nans_col_val(lmdb_version_store_v1): q = QueryBuilder() q = q[q["a"] == "row2"] expected = df[df["a"] == "row2"] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) q = QueryBuilder() q = q[q["a"] != "row2"] expected = df[df["a"] != "row2"] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) q = QueryBuilder() q = q[q["a"] == ["row2"]] expected = df[df["a"].isin(["row2"])] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) q = QueryBuilder() q = q[q["a"] != ["row2"]] expected = df[~df["a"].isin(["row2"])] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) q = QueryBuilder() q = q[q["a"] == []] expected = df[df["a"].isin([])] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) q = QueryBuilder() q = q[q["a"] != []] expected = df[~df["a"].isin([])] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) -def test_filter_string_nans_col_col(lmdb_version_store_v1): +def test_filter_string_nans_col_col(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_string_nans_col_col" # Compare all combinations of string, None, np.nan, and math.nan to one another df = pd.DataFrame( @@ -827,12 +867,12 @@ def test_filter_string_nans_col_col(lmdb_version_store_v1): 
q = QueryBuilder() q = q[q["a"] == q["b"]] expected = df[df["a"] == df["b"]] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) q = QueryBuilder() q = q[q["a"] != q["b"]] expected = df[df["a"] != df["b"]] - generic_filter_test_nans(lib, symbol, q, expected) + filter_test_nans_with_any_output_format(lib, symbol, q, expected) @pytest.mark.parametrize("method", ("isna", "notna", "isnull", "notnull")) @@ -1065,8 +1105,9 @@ def test_filter_string_number_set_membership(lmdb_version_store_v1): # float32 comparisons are excluded from the hypothesis tests due to a bug in Pandas, so cover these here instead # https://github.com/pandas-dev/pandas/issues/59524 -def test_float32_binary_comparison(lmdb_version_store_v1): +def test_float32_binary_comparison(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_float32_binary_comparison" df = pd.DataFrame( { @@ -1108,7 +1149,7 @@ def test_float32_binary_comparison(lmdb_version_store_v1): elif op == "!=": q = q[qb_lhs != qb_rhs] expected = df[pandas_lhs != pandas_rhs] - generic_filter_test(lib, symbol, q, expected) + filter_test_with_any_output_format(lib, symbol, q, expected) ################################ diff --git a/python/tests/unit/arcticdb/version_store/test_row_range.py b/python/tests/unit/arcticdb/version_store/test_row_range.py index 3744e7239f..1b1d76c5d0 100644 --- a/python/tests/unit/arcticdb/version_store/test_row_range.py +++ b/python/tests/unit/arcticdb/version_store/test_row_range.py @@ -12,12 +12,13 @@ from arcticdb.version_store.processing import QueryBuilder from arcticdb_ext.exceptions import InternalException -from arcticdb.util.test import assert_frame_equal +from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow pytestmark = pytest.mark.pipeline -def generic_row_range_test(version_store, symbol, df, start_row, end_row): +def row_range_test_with_any_output_format(version_store, symbol, df, start_row, end_row): + """Helper function for row range tests that works with any output format.""" version_store.write(symbol, df) expected_array = df.iloc[start_row:end_row] @@ -25,50 +26,56 @@ def generic_row_range_test(version_store, symbol, df, start_row, end_row): q = QueryBuilder().row_range((start_row, end_row)) received_array_via_querybuilder = version_store.read(symbol, query_builder=q).data - np.testing.assert_array_equal(expected_array, received_array) - np.testing.assert_array_equal(expected_array, received_array_via_querybuilder) + assert_frame_equal_with_arrow(expected_array, received_array) + assert_frame_equal_with_arrow(expected_array, received_array_via_querybuilder) expected_array = df.iloc[-end_row:-start_row] received_array = version_store.read(symbol, row_range=(-end_row, -start_row)).data q = QueryBuilder().row_range((-end_row, -start_row)) received_array_via_querybuilder = version_store.read(symbol, query_builder=q).data - np.testing.assert_array_equal(expected_array, received_array) - np.testing.assert_array_equal(expected_array, received_array_via_querybuilder) + assert_frame_equal_with_arrow(expected_array, received_array) + assert_frame_equal_with_arrow(expected_array, received_array_via_querybuilder) -def test_row_range_start_row_greater_than_end_row(lmdb_version_store, one_col_df): - generic_row_range_test(lmdb_version_store, "test_row_range_start_row_greater_than_end_row", one_col_df(), 3, 2) +def 
test_row_range_start_row_greater_than_end_row(lmdb_version_store, one_col_df, any_output_format):
+    lmdb_version_store.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(lmdb_version_store, "test_row_range_start_row_greater_than_end_row", one_col_df(), 3, 2)


-def test_row_range_zero_num_rows(lmdb_version_store, one_col_df):
-    generic_row_range_test(lmdb_version_store, "test_row_range_zero_num_rows", one_col_df(), 2, 2)
+def test_row_range_zero_num_rows(lmdb_version_store, one_col_df, any_output_format):
+    lmdb_version_store.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(lmdb_version_store, "test_row_range_zero_num_rows", one_col_df(), 2, 2)


-def test_row_range_one_num_rows(lmdb_version_store_tiny_segment, one_col_df):
-    generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_one_num_rows", one_col_df(), 2, 3)
+def test_row_range_one_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(lmdb_version_store_tiny_segment, "test_row_range_one_num_rows", one_col_df(), 2, 3)


-def test_row_range_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df):
+def test_row_range_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
     # lmdb_version_store_tiny_segment has segment_row_size set to 2
-    generic_row_range_test(
+    lmdb_version_store_tiny_segment.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(
         lmdb_version_store_tiny_segment, "test_row_range_segment_boundary_num_rows", one_col_df(), 2, 4
     )


-def test_row_range_multiple_segments(lmdb_version_store_tiny_segment, one_col_df):
+def test_row_range_multiple_segments(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
     # lmdb_version_store_tiny_segment has segment_row_size set to 2
-    generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_multiple_segments", one_col_df(), 3, 7)
+    lmdb_version_store_tiny_segment.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(lmdb_version_store_tiny_segment, "test_row_range_multiple_segments", one_col_df(), 3, 7)


-def test_row_range_all_rows(lmdb_version_store_tiny_segment, one_col_df):
+def test_row_range_all_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
     # one_col_df generates a dataframe with 10 rows
-    generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_all_rows", one_col_df(), 0, 10)
+    lmdb_version_store_tiny_segment.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(lmdb_version_store_tiny_segment, "test_row_range_all_rows", one_col_df(), 0, 10)


-def test_row_range_past_end(lmdb_version_store_tiny_segment, one_col_df):
+def test_row_range_past_end(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
     # one_col_df generates a dataframe with 10 rows
-    generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_past_end", one_col_df(), 5, 15)
+    lmdb_version_store_tiny_segment.set_output_format(any_output_format)
+    row_range_test_with_any_output_format(lmdb_version_store_tiny_segment, "test_row_range_past_end", one_col_df(), 5, 15)


-def test_row_range_with_column_filter(lmdb_version_store_tiny_segment, three_col_df):
+def test_row_range_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format):
@@ -82,15 +89,17 @@ def test_row_range_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format):
+    lmdb_version_store_tiny_segment.set_output_format(any_output_format)
     assert np.array_equal(
         three_col_df().filter(items=columns).iloc[start_row:end_row],
         lmdb_version_store_tiny_segment.read(symbol, row_range=(start_row, end_row), columns=columns).data,
     )


-def test_row_range_pickled_symbol(lmdb_version_store):
+def test_row_range_pickled_symbol(lmdb_version_store, any_output_format):
     symbol = "test_row_range_pickled_symbol"
+    lmdb_version_store.set_output_format(any_output_format)
     lmdb_version_store.write(symbol, np.arange(100).tolist())
     assert lmdb_version_store.is_symbol_pickled(symbol)
     with pytest.raises(InternalException):
         _ = lmdb_version_store.read(symbol, row_range=(1, 2))


 @pytest.mark.parametrize("row_range,expected", (
@@ -104,7 +113,7 @@ def test_row_range_pickled_symbol(lmdb_version_store):
     ((5, 3), pd.DataFrame({"a": []}, dtype=np.int64)),
 ))
 @pytest.mark.parametrize("api", ("query_builder", "read", "read_batch"))
-def test_row_range_open_ended(lmdb_version_store_v1, api, row_range, expected):
+def test_row_range_open_ended(lmdb_version_store_v1, api, row_range, expected, any_output_format):
     symbol = "test_row_range"
     df = pd.DataFrame({"a": np.arange(100)})
     lmdb_version_store_v1.write(symbol, df)
@@ -112,6 +121,7 @@ def test_row_range_open_ended(lmdb_version_store_v1, api, row_range, expected, any_output_format):
+    lmdb_version_store_v1.set_output_format(any_output_format)
     if api == "query_builder":
         q = QueryBuilder().row_range(row_range)
         received = lmdb_version_store_v1.read(symbol, query_builder=q).data
     elif api == "read":
         received = lmdb_version_store_v1.read(symbol, row_range=row_range).data
     else:
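A note on the fixture this whole series leans on: `any_output_format` is never defined in these patches, so it is assumed to live in the suite's conftest.py. A minimal sketch of the pattern, assuming a parametrised pytest fixture and an OutputFormat enum behind set_output_format (the import path and member names below are assumptions, not taken from this diff):

    import pytest

    # Assumption: arcticdb exposes an OutputFormat enum that
    # NativeVersionStore.set_output_format() accepts; the path and the
    # member names here are illustrative only.
    from arcticdb.options import OutputFormat

    @pytest.fixture(params=[OutputFormat.PANDAS, OutputFormat.EXPERIMENTAL_ARROW])
    def any_output_format(request):
        # pytest runs every test that requests this fixture once per param,
        # so each test body executes against both output formats.
        return request.param

With a fixture shaped like this, each test above that adds `any_output_format` to its signature and calls `lib.set_output_format(any_output_format)` before reading is exercised once per supported output format.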
From 1e17ec4e3ae171d5256e82185b77a2b1e2828685 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 20 Aug 2025 08:37:57 +0000
Subject: [PATCH 5/7] Updated test_query_builder_sparse.py and
 test_query_builder_batch.py to use any_output_format fixture

Co-authored-by: IvoDD <5950454+IvoDD@users.noreply.github.com>
---
 .../version_store/test_query_builder_batch.py | 22 ++++---
 .../test_query_builder_sparse.py              | 64 +++++++++++--------
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py b/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py
index 911d5929e4..bbd8796213 100644
--- a/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py
+++ b/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py
@@ -13,13 +13,15 @@
 from arcticdb_ext.storage import KeyType, NoDataFoundException
 from arcticdb.version_store.processing import QueryBuilder
 from arcticdb_ext.exceptions import InternalException, StorageException, UserInputException
+from arcticdb.util.test import assert_frame_equal_with_arrow


 pytestmark = pytest.mark.pipeline


-def test_filter_batch_one_query(lmdb_version_store_v1):
+def test_filter_batch_one_query(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     sym1 = "sym1"
     sym2 = "sym2"
     df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2))
@@ -33,12 +35,13 @@ def test_filter_batch_one_query(lmdb_version_store_v1):
     batch_res = lib.batch_read([sym1, sym2], query_builder=q)
     res1 = batch_res[sym1].data
     res2 = batch_res[sym2].data
-    assert np.array_equal(df1.query(pandas_query), res1)
-    assert np.array_equal(df2.query(pandas_query), res2)
+    assert_frame_equal_with_arrow(df1.query(pandas_query), res1)
+    assert_frame_equal_with_arrow(df2.query(pandas_query), res2)


-def test_filter_batch_multiple_queries(lmdb_version_store_v1):
+def test_filter_batch_multiple_queries(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     sym1 = "sym1"
     sym2 = "sym2"
     df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2))
@@ -55,12 +58,13 @@ def test_filter_batch_multiple_queries(lmdb_version_store_v1):
     batch_res = lib.batch_read([sym1, sym2], query_builder=[q1, q2])
     res1 = batch_res[sym1].data
     res2 = batch_res[sym2].data
-    assert np.array_equal(df1.query(pandas_query1), res1)
-    assert np.array_equal(df2.query(pandas_query2), res2)
+    assert_frame_equal_with_arrow(df1.query(pandas_query1), res1)
+    assert_frame_equal_with_arrow(df2.query(pandas_query2), res2)


-def test_filter_batch_multiple_queries_with_none(lmdb_version_store_v1):
+def test_filter_batch_multiple_queries_with_none(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     sym1 = "sym1"
     sym2 = "sym2"
     df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2))
@@ -74,8 +78,8 @@ def test_filter_batch_multiple_queries_with_none(lmdb_version_store_v1):
     batch_res = lib.batch_read([sym1, sym2], query_builder=[None, q2])
     res1 = batch_res[sym1].data
     res2 = batch_res[sym2].data
-    assert np.array_equal(df1, res1)
-    assert np.array_equal(df2.query(pandas_query2), res2)
+    assert_frame_equal_with_arrow(df1, res1)
+    assert_frame_equal_with_arrow(df2.query(pandas_query2), res2)


 def test_filter_batch_incorrect_query_count(lmdb_version_store_v1):
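`assert_frame_equal_with_arrow` is imported from arcticdb.util.test throughout the series, but its implementation never appears in the diff. A plausible sketch, under the assumption that Arrow-format reads return a pyarrow.Table while pandas-format reads return a DataFrame, is to normalise both sides to pandas and delegate to the stock comparison:

    import pyarrow as pa
    from pandas.testing import assert_frame_equal

    def assert_frame_equal_with_arrow(expected, received, **kwargs):
        # Sketch only; the real helper lives in arcticdb.util.test.
        # Convert pyarrow.Table results back to pandas so both sides are
        # compared with the same machinery.
        if isinstance(expected, pa.Table):
            expected = expected.to_pandas()
        if isinstance(received, pa.Table):
            received = received.to_pandas()
        # Keyword arguments such as check_dtype fall through unchanged.
        assert_frame_equal(expected, received, **kwargs)

This is also why call sites in these patches can keep passing `check_dtype=False`: the kwargs are forwarded to pandas' assert_frame_equal.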
diff --git a/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py b/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py
index c746e28001..7a4361eaf4 100644
--- a/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py
+++ b/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py
@@ -12,7 +12,7 @@
 import pytest

 from arcticdb.version_store.processing import QueryBuilder
-from arcticdb.util.test import assert_frame_equal
+from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow
 from arcticdb.util.hypothesis import use_of_function_scoped_fixtures_in_hypothesis_checked


@@ -46,87 +46,99 @@ def write_test_data(self, lmdb_version_store):
         lib.compact_incomplete(self.sym, False, False, sparsify=True)
         self.df = pd.concat([df_0, df_1])

-    def test_filter_isnull(self, lmdb_version_store):
+    def test_filter_isnull(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df[self.df["sparse1"].isnull()]
         q = QueryBuilder()
         q = q[q["sparse1"].isnull()]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_notnull(self, lmdb_version_store):
+    def test_filter_notnull(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df[self.df["sparse1"].notnull()]
         q = QueryBuilder()
         q = q[q["sparse1"].notnull()]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_col_equals_val(self, lmdb_version_store):
+    def test_filter_col_equals_val(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df.query("sparse1 == 1")
         q = QueryBuilder()
         q = q[q["sparse1"] == 1]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_col_not_equals_val(self, lmdb_version_store):
+    def test_filter_col_not_equals_val(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df.query("sparse1 != 2")
         q = QueryBuilder()
         q = q[q["sparse1"] != 2]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_col_isin_value_set(self, lmdb_version_store):
+    def test_filter_col_isin_value_set(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df.query("sparse1 in [1]")
         q = QueryBuilder()
         q = q[q["sparse1"].isin([1])]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_col_isnotin_value_set(self, lmdb_version_store):
+    def test_filter_col_isnotin_value_set(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df.query("sparse1 not in [1]")
         q = QueryBuilder()
         q = q[q["sparse1"].isnotin([1])]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_col_equals_col(self, lmdb_version_store):
+    def test_filter_col_equals_col(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df.query("sparse1 == sparse2")
         q = QueryBuilder()
         q = q[q["sparse1"] == q["sparse2"]]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_filter_col_not_equals_col(self, lmdb_version_store):
+    def test_filter_col_not_equals_col(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df.query("sparse1 != sparse2")
         q = QueryBuilder()
         q = q[q["sparse1"] != q["sparse2"]]
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_project_minus_col(self, lmdb_version_store):
+    def test_project_minus_col(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df
         expected["projected"] = -expected["sparse1"]
         q = QueryBuilder()
         q = q.apply("projected", -q["sparse1"])
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_project_col_plus_val(self, lmdb_version_store):
+    def test_project_col_plus_val(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df
         expected["projected"] = expected["sparse1"] + 1
         q = QueryBuilder()
         q = q.apply("projected", q["sparse1"] + 1)
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_project_col_divided_by_col(self, lmdb_version_store):
+    def test_project_col_divided_by_col(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         expected = self.df
         expected["projected"] = expected["sparse1"] / expected["sparse2"]
         q = QueryBuilder()
         q = q.apply("projected", q["sparse1"] / q["sparse2"])
         received = lmdb_version_store.read(self.sym, query_builder=q).data
-        assert_frame_equal(expected, received)
+        assert_frame_equal_with_arrow(expected, received)

-    def test_groupby(self, lmdb_version_store):
+    def test_groupby(self, lmdb_version_store, any_output_format):
+        lmdb_version_store.set_output_format(any_output_format)
         aggs = {
             "sum": ("sparse2", "sum"),
             "min": ("sparse2", "min"),
@@ -141,10 +153,11 @@ def test_groupby(self, lmdb_version_store):
         received = lmdb_version_store.read(self.sym, query_builder=q).data
         received = received.reindex(columns=sorted(received.columns))
         received.sort_index(inplace=True)
-        assert_frame_equal(expected, received, check_dtype=False)
+        assert_frame_equal_with_arrow(expected, received, check_dtype=False)


-def test_query_builder_sparse_dynamic_schema_type_change(lmdb_version_store_dynamic_schema):
+def test_query_builder_sparse_dynamic_schema_type_change(lmdb_version_store_dynamic_schema, any_output_format):
     lib = lmdb_version_store_dynamic_schema
+    lib.set_output_format(any_output_format)
     sym = "test_query_builder_sparse_dynamic_schema_type_change"
     df_0 = pd.DataFrame(
@@ -172,7 +185,7 @@ def test_query_builder_sparse_dynamic_schema_type_change(lmdb_version_store_dyna
     q = QueryBuilder()
     q = q[q["sparse1"].isnull()]
     received = lib.read(sym, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


 @use_of_function_scoped_fixtures_in_hypothesis_checked
@@ -186,9 +199,10 @@ def test_query_builder_sparse_dynamic_schema_type_change(lmdb_version_store_dyna
     ),
 )
-def test_query_builder_sparse_hypothesis(lmdb_version_store_v1, df):
+def test_query_builder_sparse_hypothesis(lmdb_version_store_v1, df, any_output_format):
     assume(not df.empty and not df["sparse1"].isnull().all() and not df["sparse2"].isnull().all())
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     sym = "test_query_builder_sparse_hypothesis"

     df.index = pd.date_range("2024-01-01", periods=len(df))
@@ -200,7 +214,7 @@ def test_query_builder_sparse_hypothesis(lmdb_version_store_v1, df):
     q = QueryBuilder()
     q = q[q["sparse1"].isnull()]
     received = lib.read(sym, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Projection
     expected = df
@@ -208,11 +222,11 @@ def test_query_builder_sparse_hypothesis(lmdb_version_store_v1, df):
     q = QueryBuilder()
     q = q.apply("projected", q["sparse1"] + q["sparse2"])
     received = lib.read(sym, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Groupby + aggregation
     expected = df.groupby("sparse1").agg({"sparse2": "sum"})
     q = QueryBuilder().groupby("sparse1").agg({"sparse2": "sum"})
     received = lib.read(sym, query_builder=q).data
     received.sort_index(inplace=True)
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)
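The next patch adds `generic_resample_test_with_arrow_support`, which temporarily rebinds `arcticdb.util.test.assert_frame_equal` around a call to `generic_resample_test`. That works because `generic_resample_test` is defined in that same module and resolves `assert_frame_equal` through its module globals at call time. Assuming that holds, the same effect can be had more defensively with `unittest.mock.patch.object`, which restores the original binding even if the wrapped call raises. The helper name below is hypothetical, not part of the patch:

    from unittest import mock

    import arcticdb.util.test as test_module
    from arcticdb.util.test import assert_frame_equal_with_arrow, generic_resample_test

    def resample_and_compare(lib, sym, rule, aggregations, data, **kwargs):
        # While the context manager is active, module-global lookups of
        # assert_frame_equal inside generic_resample_test resolve to the
        # Arrow-aware version; on exit the original binding is restored.
        with mock.patch.object(test_module, "assert_frame_equal", assert_frame_equal_with_arrow):
            return generic_resample_test(lib, sym, rule, aggregations, data, **kwargs)

The try/finally swap in the patch below is functionally equivalent.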
From 8130b4aa42fe5c6bbfcb6818f1a5ace75c968b7d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 20 Aug 2025 08:41:16 +0000
Subject: [PATCH 6/7] Updated test_resample.py to use any_output_format fixture

Co-authored-by: IvoDD <5950454+IvoDD@users.noreply.github.com>
---
 .../arcticdb/version_store/test_resample.py   | 157 +++++++++++-----
 1 file changed, 114 insertions(+), 43 deletions(-)

diff --git a/python/tests/unit/arcticdb/version_store/test_resample.py b/python/tests/unit/arcticdb/version_store/test_resample.py
index ef4e92a31b..c21e847a40 100644
--- a/python/tests/unit/arcticdb/version_store/test_resample.py
+++ b/python/tests/unit/arcticdb/version_store/test_resample.py
@@ -15,6 +15,7 @@
 from arcticdb.exceptions import ArcticDbNotYetImplemented, SchemaException, UserInputException
 from arcticdb.util.test import (
     assert_frame_equal,
+    assert_frame_equal_with_arrow,
     generic_resample_test,
     largest_numeric_type,
     common_sum_aggregation_dtype,
@@ -29,6 +30,65 @@

 pytestmark = pytest.mark.pipeline


+def generic_resample_test_with_empty_buckets(lib, sym, rule, aggregations, date_range=None):
+    """
+    Perform a resampling in ArcticDB and compare it against the same query in Pandas.
+
+    This will remove all empty buckets, mirroring ArcticDB's behavior. It cannot take additional parameters such as
+    origin and offset. In case such parameters are needed, arcticdb.util.test.generic_resample_test can be used.
+
+    This can drop buckets even when all columns are of float type, while generic_resample_test needs at least one
+    non-float column.
+    """
+    # Pandas doesn't have a good date_range equivalent in resample, so just use read for that
+    expected = lib.read(sym, date_range=date_range).data
+    # Pandas 1.X needs None as the first argument to agg with named aggregators
+    expected = expected.groupby(partial(round, freq=rule)).agg(None, **aggregations)
+    expected = expected.reindex(columns=sorted(expected.columns))
+    q = QueryBuilder()
+    q = q.resample(rule).agg(aggregations)
+    received = lib.read(sym, date_range=date_range, query_builder=q).data
+    received = received.reindex(columns=sorted(received.columns))
+
+    assert_frame_equal_with_arrow(expected, received, check_dtype=False)
+
+
+def generic_resample_test_with_arrow_support(
+    lib,
+    sym,
+    rule,
+    aggregations,
+    data,
+    date_range=None,
+    closed=None,
+    label=None,
+    offset=None,
+    origin=None,
+    drop_empty_buckets_for=None,
+    expected_types=None,
+):
+    """Wrapper around generic_resample_test that uses assert_frame_equal_with_arrow."""
+    from arcticdb.util.test import generic_resample_test
+
+    # Store the original assert_frame_equal
+    import arcticdb.util.test as test_module
+    original_assert = test_module.assert_frame_equal
+
+    # Temporarily replace it with our arrow-compatible version; generic_resample_test
+    # picks the replacement up through its module globals at call time
+    test_module.assert_frame_equal = assert_frame_equal_with_arrow
+
+    try:
+        result = generic_resample_test(
+            lib, sym, rule, aggregations, data, date_range, closed, label, offset, origin,
+            drop_empty_buckets_for, expected_types
+        )
+    finally:
+        # Restore the original function
+        test_module.assert_frame_equal = original_assert
+
+    return result
+
+
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]

 def all_aggregations_dict(col):
@@ -41,7 +101,7 @@
     td = pd.Timedelta(freq)
     return pd.Timestamp((t.value // td.value) * td.value)

-def generic_resample_test_with_empty_buckets(lib, sym, rule, aggregations, date_range=None):
+def resample_test_with_any_output_format(lib, sym, rule, aggregations, date_range=None):
     """
     Perform a resampling in ArcticDB and compare it against the same query in Pandas.
@@ -61,15 +121,16 @@ def generic_resample_test_with_empty_buckets(lib, sym, rule, aggregations, date_ received = lib.read(sym, date_range=date_range, query_builder=q).data received = received.reindex(columns=sorted(received.columns)) - assert_frame_equal(expected, received, check_dtype=False) + assert_frame_equal_with_arrow(expected, received, check_dtype=False) @pytest.mark.parametrize("freq", ("min", "h", "D", "1h30min")) @pytest.mark.parametrize("date_range", (None, (pd.Timestamp("2024-01-02T12:00:00"), pd.Timestamp("2024-01-03T12:00:00")))) @pytest.mark.parametrize("closed", ("left", "right")) @pytest.mark.parametrize("label", ("left", "right")) -def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label): +def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_resampling" # Want an index with data every minute for 2 days, with additional data points 1 nanosecond before and after each # minute to catch off-by-one errors @@ -84,7 +145,7 @@ def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label): df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, freq, @@ -105,8 +166,9 @@ def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label): @pytest.mark.parametrize("closed", ("left", "right")) -def test_resampling_duplicated_index_value_on_segment_boundary(lmdb_version_store_v1, closed): +def test_resampling_duplicated_index_value_on_segment_boundary(lmdb_version_store_v1, closed, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_resampling_duplicated_index_value_on_segment_boundary" # Will group on microseconds df_0 = pd.DataFrame({"col": np.arange(4)}, index=np.array([0, 1, 2, 1000], dtype="datetime64[ns]")) @@ -116,7 +178,7 @@ def test_resampling_duplicated_index_value_on_segment_boundary(lmdb_version_stor lib.append(sym, df_1) lib.append(sym, df_2) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "us", @@ -137,7 +199,7 @@ def test_all_buckets_have_values(self, lmdb_version_store_v1): lib.write(sym, df) date_range = (dt.datetime(2023, 12, 7, 23, 59, 48), dt.datetime(2023, 12, 7, 23, 59, 52)) - generic_resample_test_with_empty_buckets(lib, sym, 's', {'high': ('mid', 'max')}, date_range=date_range) + resample_test_with_any_output_format(lib, sym, 's', {'high': ('mid', 'max')}, date_range=date_range) @pytest.mark.parametrize("closed", ("left", "right")) def test_first_bucket_is_empy(self, lmdb_version_store_v1, closed): @@ -155,7 +217,7 @@ def test_first_bucket_is_empy(self, lmdb_version_store_v1, closed): lib.write(sym, df) date_range = (dt.datetime(2023, 12, 7, 23, 59, 49), dt.datetime(2023, 12, 7, 23, 59, 50)) - generic_resample_test(lib, sym, 's', {'high': ('mid', 'max')}, df, date_range=date_range, closed=closed) + generic_resample_test_with_arrow_support(lib, sym, 's', {'high': ('mid', 'max')}, df, date_range=date_range, closed=closed) @pytest.mark.parametrize("closed", ("left", "right")) def test_last_bucket_is_empty(self, lmdb_version_store_v1, closed): @@ -174,7 +236,7 @@ def test_last_bucket_is_empty(self, lmdb_version_store_v1, closed): lib.write(sym, df) date_range = (dt.datetime(2023, 12, 7, 23, 59, 48), dt.datetime(2023, 12, 7, 23, 59, 49, 500000)) - generic_resample_test(lib, sym, 's', {'high': ('mid', 'max')}, 
df, date_range=date_range, closed=closed)
+        generic_resample_test_with_arrow_support(lib, sym, 's', {'high': ('mid', 'max')}, df, date_range=date_range, closed=closed)

     def test_inner_buckets_are_empty(self, lmdb_version_store_v1):
         lib = lmdb_version_store_v1
@@ -191,18 +253,19 @@
         lib.write(sym, df)

         date_range = (dt.datetime(2023, 12, 7, 23, 59, 48), dt.datetime(2023, 12, 7, 23, 59, 55))
-        generic_resample_test_with_empty_buckets(lib, sym, 's', {'high': ('mid', 'max')}, date_range=date_range)
+        resample_test_with_any_output_format(lib, sym, 's', {'high': ('mid', 'max')}, date_range=date_range)


-def test_resampling_timezones(lmdb_version_store_v1):
+def test_resampling_timezones(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     sym = "test_resampling_timezones"
     # UK clocks go forward at 1am on March 31st in 2024
     index = pd.date_range("2024-03-31T00:00:00", freq="min", periods=240, tz="Europe/London")
     df = pd.DataFrame({"col": np.arange(len(index))}, index=index)
     lib.write(sym, df)
-    generic_resample_test(
+    generic_resample_test_with_arrow_support(
         lib,
         sym,
         "h",
@@ -214,7 +277,7 @@
     index = pd.date_range("2024-10-27T00:00:00", freq="min", periods=240, tz="Europe/London")
     df = pd.DataFrame({"col": np.arange(len(index))}, index=index)
     lib.write(sym, df)
-    generic_resample_test(
+    generic_resample_test_with_arrow_support(
         lib,
         sym,
         "h",
@@ -223,8 +286,8 @@


-def test_resampling_nan_correctness(version_store_factory):
+def test_resampling_nan_correctness(version_store_factory, any_output_format):
     lib = version_store_factory(
         column_group_size=2,
         segment_row_size=2,
         dynamic_strings=True,
@@ -278,11 +342,13 @@
         }
     )

-    generic_resample_test(lib, sym, "us", agg_dict, df)
+    lib.set_output_format(any_output_format)
+    generic_resample_test_with_arrow_support(lib, sym, "us", agg_dict, df)


-def test_resampling_bool_columns(lmdb_version_store_tiny_segment):
+def test_resampling_bool_columns(lmdb_version_store_tiny_segment, any_output_format):
     lib = lmdb_version_store_tiny_segment
+    lib.set_output_format(any_output_format)
     sym = "test_resampling_bool_columns"

     idx = [0, 1, 1000, 1001, 2000, 2001, 3000, 3001]
@@ -293,7 +358,7 @@
     df = pd.DataFrame({"col": col}, index=idx)
     lib.write(sym, df)

-    generic_resample_test(
+    generic_resample_test_with_arrow_support(
         lib,
         sym,
         "us",
@@ -310,8 +375,9 @@
     )


-def test_resampling_dynamic_schema_types_changing(lmdb_version_store_dynamic_schema_v1):
+def test_resampling_dynamic_schema_types_changing(lmdb_version_store_dynamic_schema_v1, any_output_format):
     lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(any_output_format)
     sym = "test_resampling_dynamic_schema_types_changing"
     # Will group on microseconds
     idx_0 = [0, 1, 2, 1000]
@@ -326,7 +392,7 @@
     df_1 = pd.DataFrame({"col": col_1}, index=idx_1)
     lib.append(sym, df_1)

-    generic_resample_test(
+    generic_resample_test_with_arrow_support(
         lib,
         sym,
         "us",
@@ -343,8 +409,9 @@


-def
test_resampling_empty_bucket_in_range(lmdb_version_store_v1): +def test_resampling_empty_bucket_in_range(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_resampling_empty_bucket_in_range" # Group on microseconds, so bucket 1000-1999 will be empty idx = [0, 1, 2000, 2001] @@ -367,7 +434,7 @@ def test_resampling_empty_bucket_in_range(lmdb_version_store_v1): ) lib.write(sym, df) - generic_resample_test_with_empty_buckets( + resample_test_with_any_output_format( lib, sym, "us", @@ -383,7 +450,7 @@ def test_resampling_empty_bucket_in_range(lmdb_version_store_v1): ) -def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_segment): +def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_segment, any_output_format): # Covers a corner case where the date_range argument specifies that a row-slice is needed, but the bucket boundaries # mean that all of the index values required fall into a bucket being handled by the previous row-slice, and so # the call to ResampleClause::process produces a segment with no rows @@ -394,6 +461,7 @@ def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny # Therefore the only index value from the second row slice remaining to be processed is 3000ns. But this is outside # the specified date range, and so this call to ResampleClause::process produces a segment with no rows lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) sym = "test_resampling_row_slice_responsible_for_no_buckets" df = pd.DataFrame( { @@ -402,7 +470,7 @@ def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny index=[pd.Timestamp(0), pd.Timestamp(100), pd.Timestamp(200), pd.Timestamp(3000)], ) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "us", @@ -547,8 +615,9 @@ def test_resampling_sparse_data(lmdb_version_store_v1): lib.read(sym, query_builder=q) -def test_resampling_empty_type_column(lmdb_version_store_empty_types_v1): +def test_resampling_empty_type_column(lmdb_version_store_empty_types_v1, any_output_format): lib = lmdb_version_store_empty_types_v1 + lib.set_output_format(any_output_format) sym = "test_resampling_empty_type_column" lib.write(sym, pd.DataFrame({"col": ["hello"]}, index=[pd.Timestamp(0)])) @@ -577,7 +646,7 @@ def test_offset_smaller_than_freq(self, lmdb_version_store_v1, closed, offset): rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -595,7 +664,7 @@ def test_offset_larger_than_freq(self, lmdb_version_store_v1, closed, offset): rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -618,7 +687,7 @@ def test_values_on_offset_boundary(self, lmdb_version_store_v1, closed, offset): rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -646,7 +715,7 @@ def test_with_date_range(self, lmdb_version_store_v1, closed, date_range, offset rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - 
generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -688,7 +757,7 @@ def test_origin(self, lmdb_version_store_v1, closed, origin): rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -717,7 +786,7 @@ def test_origin_is_multiple_of_freq(self, lmdb_version_store_v1, closed, origin, rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -744,7 +813,7 @@ def test_pre_epoch_data(self, lmdb_version_store_v1, closed, origin): rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -774,7 +843,7 @@ def test_origin_off_by_one_on_boundary(self, lmdb_version_store_v1, closed, orig rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -814,7 +883,7 @@ def test_epoch_and_ts_origin_works_with_date_range(self, lmdb_version_store_v1, rng = np.random.default_rng() df = pd.DataFrame({"col": rng.integers(0, 100, len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -839,8 +908,9 @@ def test_epoch_and_ts_origin_works_with_date_range(self, lmdb_version_store_v1, pd.Timestamp("2025-01-03 15:00:00") ]) @pytest.mark.parametrize("offset", ['10s', '13s', '2min']) -def test_origin_offset_combined(lmdb_version_store_v1, closed, origin, label, offset): +def test_origin_offset_combined(lmdb_version_store_v1, closed, origin, label, offset, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_origin_special_values" # Start and end are picked so that #bins * rule + start != end on purpose to test # the bin generation in case of end and end_day @@ -849,7 +919,7 @@ def test_origin_offset_combined(lmdb_version_store_v1, closed, origin, label, of idx = pd.date_range(start, end, freq='10s') df = pd.DataFrame({"col": range(len(idx))}, index=idx) lib.write(sym, df) - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, "2min", @@ -881,8 +951,9 @@ def test_min_with_one_infinity_element(lmdb_version_store_v1): assert np.isneginf(lib.read(sym, query_builder=q).data['col_min'][0]) -def test_date_range_outside_symbol_timerange(lmdb_version_store_v1): +def test_date_range_outside_symbol_timerange(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_date_range_outside_symbol_timerange" df = pd.DataFrame({"col": np.arange(10)}, index=pd.date_range("2025-01-01", periods=10)) lib.write(sym, df) @@ -918,7 +989,7 @@ def test_aggregation_column_not_in_segment(self, lmdb_version_store_dynamic_sche "aggregated_last": dtype, "aggregated_count": np.uint64, } - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, @@ -955,7 +1026,7 @@ def test_bucket_intersects_two_segments_aggregation_column_not_in_first(self, lm "col_0_last": dtype, "col_0_count": np.uint64, } - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, @@ -992,7 
+1063,7 @@ def test_bucket_intersects_two_segments_aggregation_column_not_in_second(self, l "col_0_count": np.uint64, } agg = {f"{name}_{op}": (name, op) for name in ["col_0"] for op in ALL_AGGREGATIONS} - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, @@ -1046,7 +1117,7 @@ def test_bucket_spans_two_segments(self, lmdb_version_store_dynamic_schema_v1, l "col_1_count": np.uint64, } agg = {f"{name}_{op}": (name, op) for name in ["col_0", "col_1"] for op in ALL_AGGREGATIONS} - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, @@ -1101,7 +1172,7 @@ def test_bucket_spans_three_segments(self, lmdb_version_store_dynamic_schema_v1, "col_1_count": np.uint64, } agg = {f"{name}_{op}": (name, op) for name in ["col_0", "col_1"] for op in ALL_AGGREGATIONS} - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, @@ -1185,7 +1256,7 @@ def test_middle_segment_does_not_contain_column(self, lmdb_version_store_dynamic columns_to_resample = ["to_resample"] agg = {f"{name}_{op}": (name, op) for name in columns_to_resample for op in ALL_AGGREGATIONS} expected_types = {f"{name}_{op}": expected_aggregation_type(op, df_list, name) for name in columns_to_resample for op in ALL_AGGREGATIONS} - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, @@ -1220,7 +1291,7 @@ def test_int_float_promotion(self, lmdb_version_store_dynamic_schema_v1): lib.append(sym, df) agg = {"to_resample_first": ("to_resample", "first")} expected_types = {"to_resample_first": np.float32} - generic_resample_test( + generic_resample_test_with_arrow_support( lib, sym, rule, From 88571845d60b489c0d186eed87523caa576adc32 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 Aug 2025 08:42:56 +0000 Subject: [PATCH 7/7] Completed parameterization of all remaining pipeline tests for any_output_format Co-authored-by: IvoDD <5950454+IvoDD@users.noreply.github.com> --- .../version_store/test_basic_version_store.py | 27 +- .../test_filtering_hypothesis.py | 6 +- .../version_store/test_lazy_dataframe.py | 155 ++++++----- .../test_projection_hypothesis.py | 26 +- .../test_symbol_concatenation.py | 82 +++--- .../arcticdb/version_store/test_ternary.py | 245 ++++++++++-------- 6 files changed, 309 insertions(+), 232 deletions(-) diff --git a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py index 63c2359f0c..edc762eac2 100644 --- a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py +++ b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py @@ -40,6 +40,7 @@ sample_dataframe_only_strings, get_sample_dataframe, assert_frame_equal, + assert_frame_equal_with_arrow, assert_series_equal, config_context, distinct_timestamps, @@ -58,11 +59,11 @@ def symbol(): def assert_equal_value(data, expected): received = data.reindex(sorted(data.columns), axis=1) expected = expected.reindex(sorted(expected.columns), axis=1) - assert_frame_equal(received, expected) + assert_frame_equal_with_arrow(received, expected) def assert_equal(received, expected): - assert_frame_equal(received, expected) + assert_frame_equal_with_arrow(received, expected) assert received.equals(expected) @@ -170,7 +171,7 @@ def test_snapshot_names(object_version_store, snap): object_version_store.delete("sym") assert not object_version_store.has_symbol("sym") assert 
object_version_store.list_snapshots() == {snap: None}
-    assert_frame_equal(object_version_store.read("sym", as_of=snap).data, df)
+    assert_frame_equal_with_arrow(object_version_store.read("sym", as_of=snap).data, df)


 def test_empty_snapshot_name_not_allowed(object_version_store):
@@ -226,7 +227,7 @@ def test_unhandled_chars_already_present_write(object_version_store, three_col_d
     object_version_store.write(sym, staged_data, parallel=True)
     object_version_store.compact_incomplete(sym, append=False, convert_int_to_float=False)

-    assert_frame_equal(object_version_store.read(sym).data, staged_data)
+    assert_frame_equal_with_arrow(object_version_store.read(sym).data, staged_data)


 @pytest.mark.parametrize("unhandled_char", [chr(127), chr(128)])
@@ -253,7 +254,7 @@ def test_unhandled_chars_already_present_on_deleted_symbol(object_version_store,
     else:
         object_version_store.write(sym, data)

-    assert_frame_equal(object_version_store.read(sym).data, data)
+    assert_frame_equal_with_arrow(object_version_store.read(sym).data, data)


 @pytest.mark.parametrize("unhandled_char", [chr(127), chr(128)])
@@ -825,7 +826,8 @@ def test_range_index(basic_store, sym):
 @pytest.mark.pipeline
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
 @pytest.mark.storage
-def test_date_range(basic_store, use_date_range_clause):
+def test_date_range(basic_store, use_date_range_clause, any_output_format):
+    basic_store.set_output_format(any_output_format)
     initial_timestamp = pd.Timestamp("2019-01-01")
     df = pd.DataFrame(data=np.arange(100), index=pd.date_range(initial_timestamp, periods=100))
     sym = "date_test"
@@ -874,6 +875,7 @@ def test_date_range(basic_store, use_date_range_clause):
 @pytest.mark.pipeline
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
 @pytest.mark.storage
-def test_date_range_none(basic_store, use_date_range_clause):
+def test_date_range_none(basic_store, use_date_range_clause, any_output_format):
+    basic_store.set_output_format(any_output_format)
     sym = "date_test2"
     rows = 100
@@ -894,7 +897,8 @@ def test_date_range_none(basic_store, use_date_range_clause):
 @pytest.mark.pipeline
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
 @pytest.mark.storage
-def test_date_range_start_equals_end(basic_store, use_date_range_clause):
+def test_date_range_start_equals_end(basic_store, use_date_range_clause, any_output_format):
+    basic_store.set_output_format(any_output_format)
     sym = "date_test2"
     rows = 100
     initial_timestamp = pd.Timestamp("2019-01-01")
@@ -917,6 +920,7 @@ def test_date_range_start_equals_end(basic_store, use_date_range_clause):
 @pytest.mark.pipeline
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
 @pytest.mark.storage
-def test_date_range_row_sliced(basic_store_tiny_segment, use_date_range_clause):
+def test_date_range_row_sliced(basic_store_tiny_segment, use_date_range_clause, any_output_format):
+    basic_store_tiny_segment.set_output_format(any_output_format)
     lib = basic_store_tiny_segment
     sym = "test_date_range_row_sliced"
@@ -2722,7 +2727,8 @@ def test_batch_append_with_throw_exception(basic_store, three_col_df):
 @pytest.mark.pipeline
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
 @pytest.mark.storage
-def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_range_clause):
+def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_range_clause, any_output_format):
     lmdb_version_store = basic_store_tombstone_and_sync_passive
+    lmdb_version_store.set_output_format(any_output_format)
     symbols = []
     for i in range(5):
@@ -2978,7 +2985,7 @@ def
test_dynamic_schema_read_columns(version_store_factory, lib_name, bucketize_ expected = pd.DataFrame({c: [column_data[c][0] if c in to_write else np.nan, append_column_data[c][0] if c in to_append else np.nan] for c in read_columns}) data.sort_index(inplace=True, axis=1) expected.sort_index(inplace=True, axis=1) - assert_frame_equal(data, expected) + assert_frame_equal_with_arrow(data, expected) lmdb_lib.delete("test") @@ -3228,7 +3235,7 @@ def find_expected_version(first_to_check): with pytest.raises(NoSuchVersionException): lib.read(symbol, as_of=timestamp) else: - assert_frame_equal(lib.read(symbol, as_of=timestamp).data, dataframes[expected_version_to_find]) + assert_frame_equal_with_arrow(lib.read(symbol, as_of=timestamp).data, dataframes[expected_version_to_find]) with config_context("VersionMap.ReloadInterval", timeout): # Write versions and keep track of time before and after writing diff --git a/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py b/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py index bc10c28356..fb3425e85e 100644 --- a/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py +++ b/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py @@ -198,8 +198,9 @@ def test_filter_string_empty_set_membership(lmdb_version_store_v1, df): df_dt=st.datetimes(min_value=datetime(2020, 1, 1), max_value=datetime(2022, 1, 1), timezones=timezone_st()), comparison_dt=st.datetimes(min_value=datetime(2020, 1, 1), max_value=datetime(2022, 1, 1), timezones=timezone_st()), ) -def test_filter_datetime_timezone_aware_hypothesis(lmdb_version_store_v1, df_dt, comparison_dt): +def test_filter_datetime_timezone_aware_hypothesis(lmdb_version_store_v1, df_dt, comparison_dt, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_datetime_timezone_aware_hypothesis" df = pd.DataFrame({"a": [df_dt]}) lib.write(symbol, df) @@ -299,9 +300,10 @@ def test_filter_more_columns_than_fit_in_one_segment(lmdb_version_store_tiny_seg ] ), ) -def test_filter_with_column_slicing(lmdb_version_store_tiny_segment, df): +def test_filter_with_column_slicing(lmdb_version_store_tiny_segment, df, any_output_format): assume(not df.empty) lib = lmdb_version_store_tiny_segment + lib.set_output_format(any_output_format) symbol = "test_filter_with_column_filtering" lib.write(symbol, df) q = QueryBuilder() diff --git a/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py b/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py index 1e741f6643..2bca8b25dc 100644 --- a/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py +++ b/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py @@ -11,14 +11,15 @@ import pytest from arcticdb import col, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest, where -from arcticdb.util.test import assert_frame_equal +from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow pytestmark = pytest.mark.pipeline -def test_lazy_read(lmdb_library): +def test_lazy_read(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_read" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -31,11 +32,12 @@ def test_lazy_read(lmdb_library): received = lazy_df.collect().data expected = lib.read(sym, as_of=0, date_range=(pd.Timestamp("2000-01-03"), 
pd.Timestamp("2000-01-07")), columns=["col2"]).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_date_range(lmdb_library): +def test_lazy_date_range(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_date_range" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -47,11 +49,12 @@ def test_lazy_date_range(lmdb_library): received = lazy_df.collect().data expected = df.iloc[1:9] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_filter(lmdb_library): +def test_lazy_filter(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_filter" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -63,11 +66,12 @@ def test_lazy_filter(lmdb_library): received = lazy_df.collect().data expected = df.query("col1 in [0, 3, 6, 9]") - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_head(lmdb_library): +def test_lazy_head(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_head" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -79,11 +83,12 @@ def test_lazy_head(lmdb_library): received = lazy_df.collect().data expected = df.iloc[2:4] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_tail(lmdb_library): +def test_lazy_tail(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_tail" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -95,11 +100,12 @@ def test_lazy_tail(lmdb_library): received = lazy_df.collect().data expected = df.iloc[6:8] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_apply(lmdb_library): +def test_lazy_apply(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_apply" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -112,11 +118,12 @@ def test_lazy_apply(lmdb_library): expected = df expected["new_col"] = expected["col1"] + expected["col2"] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_apply_inline_col(lmdb_library): +def test_lazy_apply_inline_col(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_apply_inline_col" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -128,11 +135,12 @@ def test_lazy_apply_inline_col(lmdb_library): expected = df expected["new_col"] = expected["col1"] + expected["col2"] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_project(lmdb_library): +def test_lazy_project(lmdb_library, 
any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_project" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -145,11 +153,12 @@ def test_lazy_project(lmdb_library): expected = df expected["new_col"] = expected["col1"] + expected["col2"] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_project_constant_value(lmdb_library): +def test_lazy_project_constant_value(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_project" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -162,11 +171,12 @@ def test_lazy_project_constant_value(lmdb_library): expected = df expected["new_col"] = 5 - assert_frame_equal(expected, received, check_dtype=False) + assert_frame_equal_with_arrow(expected, received, check_dtype=False) -def test_lazy_ternary(lmdb_library): +def test_lazy_ternary(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_ternary" df = pd.DataFrame( { @@ -184,11 +194,12 @@ def test_lazy_ternary(lmdb_library): expected = df expected["new_col"] = np.where(df["conditional"].to_numpy(), df["col1"].to_numpy(), df["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_groupby(lmdb_library): +def test_lazy_groupby(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_groupby" df = pd.DataFrame({"col1": [0, 1, 0, 1, 2, 2], "col2": np.arange(6, dtype=np.int64)}) lib.write(sym, df) @@ -199,11 +210,12 @@ def test_lazy_groupby(lmdb_library): received.sort_index(inplace=True) expected = df.groupby("col1").agg({"col2": "sum"}) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_resample(lmdb_library): +def test_lazy_resample(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_resample" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -217,11 +229,12 @@ def test_lazy_resample(lmdb_library): expected.sort_index(inplace=True, axis=1) received.sort_index(inplace=True, axis=1) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_regex_match(lmdb_library, sym): +def test_lazy_regex_match(lmdb_library, sym, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": ["abc", "abcd", "aabc"], "b": [1, 2, 3]} @@ -234,11 +247,12 @@ def test_lazy_regex_match(lmdb_library, sym): received = lazy_df.collect().data expected = df[df.a.str.contains(pattern)] - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_lazy_with_initial_query_builder(lmdb_library): +def test_lazy_with_initial_query_builder(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_chaining" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -253,11 +267,12 @@ def 
test_lazy_with_initial_query_builder(lmdb_library): expected = df.resample("us").agg({"col": "sum"}) expected["new_col"] = expected["col"] * 3 - assert_frame_equal(expected, received, check_dtype=False) + assert_frame_equal_with_arrow(expected, received, check_dtype=False) -def test_lazy_chaining(lmdb_library): +def test_lazy_chaining(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_chaining" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -270,11 +285,12 @@ def test_lazy_chaining(lmdb_library): expected = df.resample("us").agg({"col": "sum"}) expected["new_col"] = expected["col"] * 3 - assert_frame_equal(expected, received, check_dtype=False) + assert_frame_equal_with_arrow(expected, received, check_dtype=False) -def test_lazy_batch_read(lmdb_library): +def test_lazy_batch_read(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym_0 = "test_lazy_batch_read_0" sym_1 = "test_lazy_batch_read_1" df = pd.DataFrame( @@ -296,12 +312,13 @@ def test_lazy_batch_read(lmdb_library): received = lazy_dfs.collect() expected_0 = lib.read(sym_0, as_of=0, date_range=(pd.Timestamp("2000-01-03"), pd.Timestamp("2000-01-07")), columns=["col2"]).data expected_1 = lib.read(sym_1).data - assert_frame_equal(expected_0, received[0].data) - assert_frame_equal(expected_1, received[1].data) + assert_frame_equal_with_arrow(expected_0, received[0].data) + assert_frame_equal_with_arrow(expected_1, received[1].data) -def test_lazy_batch_one_query(lmdb_library): +def test_lazy_batch_one_query(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) syms = [f"test_lazy_batch_one_query_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -313,11 +330,12 @@ def test_lazy_batch_one_query(lmdb_library): received = lazy_dfs.collect() expected = df.query("col1 in [0, 3, 6, 9]") for vit in received: - assert_frame_equal(expected, vit.data) + assert_frame_equal_with_arrow(expected, vit.data) -def test_lazy_batch_collect_separately(lmdb_library): +def test_lazy_batch_collect_separately(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) syms = [f"test_lazy_batch_collect_separately_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -334,13 +352,14 @@ def test_lazy_batch_collect_separately(lmdb_library): received_0 = lazy_df_0.collect().data received_1 = lazy_df_1.collect().data received_2 = lazy_df_2.collect().data - assert_frame_equal(expected_0, received_0) - assert_frame_equal(expected_1, received_1) - assert_frame_equal(expected_2, received_2) + assert_frame_equal_with_arrow(expected_0, received_0) + assert_frame_equal_with_arrow(expected_1, received_1) + assert_frame_equal_with_arrow(expected_2, received_2) -def test_lazy_batch_separate_queries_collect_together(lmdb_library): +def test_lazy_batch_separate_queries_collect_together(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) syms = [f"test_lazy_batch_separate_queries_collect_together_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", 
periods=10) @@ -355,13 +374,14 @@ def test_lazy_batch_separate_queries_collect_together(lmdb_library): expected_2 = df.query("col1 in [2, 4, 8]") received = LazyDataFrameCollection(lazy_dfs).collect() - assert_frame_equal(expected_0, received[0].data) - assert_frame_equal(expected_1, received[1].data) - assert_frame_equal(expected_2, received[2].data) + assert_frame_equal_with_arrow(expected_0, received[0].data) + assert_frame_equal_with_arrow(expected_1, received[1].data) + assert_frame_equal_with_arrow(expected_2, received[2].data) -def test_lazy_batch_complex(lmdb_library): +def test_lazy_batch_complex(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) syms = [f"test_lazy_batch_complex_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -394,13 +414,14 @@ def test_lazy_batch_complex(lmdb_library): expected_2["shared_new_col_1"] = expected_2["col2"] * 2 expected_2["new_col"] = expected_2["col1"] * 2 expected_2["shared_new_col_2"] = expected_2["new_col"] + 10 - assert_frame_equal(expected_0, received[0].data) - assert_frame_equal(expected_1, received[1].data) - assert_frame_equal(expected_2, received[2].data) + assert_frame_equal_with_arrow(expected_0, received[0].data) + assert_frame_equal_with_arrow(expected_1, received[1].data) + assert_frame_equal_with_arrow(expected_2, received[2].data) -def test_lazy_collect_multiple_times(lmdb_library): +def test_lazy_collect_multiple_times(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_collect_multiple_times" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -410,19 +431,20 @@ def test_lazy_collect_multiple_times(lmdb_library): lazy_df = lib.read(sym, lazy=True).resample("us").agg({"col": "sum"}) expected = df.resample("us").agg({"col": "sum"}) received_0 = lazy_df.collect().data - assert_frame_equal(expected, received_0, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_0, check_dtype=False) received_1 = lazy_df.collect().data - assert_frame_equal(expected, received_1, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_1, check_dtype=False) lazy_df["new_col"] = lazy_df["col"] * 3 received_2 = lazy_df.collect().data expected["new_col"] = expected["col"] * 3 - assert_frame_equal(expected, received_2, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_2, check_dtype=False) -def test_lazy_batch_collect_multiple_times(lmdb_library): +def test_lazy_batch_collect_multiple_times(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) syms = [f"test_lazy_batch_collect_multiple_times_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, index=pd.date_range("2000-01-01", periods=10) @@ -434,21 +456,22 @@ def test_lazy_batch_collect_multiple_times(lmdb_library): received_0 = lazy_dfs.collect() expected = df.query("col1 in [0, 3, 6, 9]") for vit in received_0: - assert_frame_equal(expected, vit.data) + assert_frame_equal_with_arrow(expected, vit.data) received_1 = lazy_dfs.collect() for vit in received_1: - assert_frame_equal(expected, vit.data) + assert_frame_equal_with_arrow(expected, vit.data) lazy_dfs = lazy_dfs[lazy_dfs["col1"].isin(0, 6)] received_2 = lazy_dfs.collect() expected = df.query("col1 in [0, 6]") for vit 
in received_2: - assert_frame_equal(expected, vit.data) + assert_frame_equal_with_arrow(expected, vit.data) -def test_lazy_collect_twice_with_date_range(lmdb_library): +def test_lazy_collect_twice_with_date_range(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_collect_twice_with_date_range" df = pd.DataFrame( { @@ -461,13 +484,14 @@ def test_lazy_collect_twice_with_date_range(lmdb_library): lazy_df = lib.read(sym, date_range=(pd.Timestamp("2000-01-03"), pd.Timestamp("2000-01-07")), lazy=True) expected = lib.read(sym, date_range=(pd.Timestamp("2000-01-03"), pd.Timestamp("2000-01-07"))).data received_0 = lazy_df.collect().data - assert_frame_equal(expected, received_0, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_0, check_dtype=False) received_1 = lazy_df.collect().data - assert_frame_equal(expected, received_1, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_1, check_dtype=False) -def test_lazy_pickling(lmdb_library): +def test_lazy_pickling(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) sym = "test_lazy_pickling" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -483,14 +507,15 @@ def test_lazy_pickling(lmdb_library): roundtripped = pickle.loads(pickle.dumps(lazy_df)) assert roundtripped == lazy_df received_initial = lazy_df.collect().data - assert_frame_equal(expected, received_initial, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_initial, check_dtype=False) received_roundtripped = roundtripped.collect().data - assert_frame_equal(expected, received_roundtripped, check_dtype=False) + assert_frame_equal_with_arrow(expected, received_roundtripped, check_dtype=False) -def test_lazy_batch_pickling(lmdb_library): +def test_lazy_batch_pickling(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) syms = [f"test_lazy_batch_pickling_{idx}" for idx in range(3)] idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -508,8 +533,8 @@ def test_lazy_batch_pickling(lmdb_library): assert roundtripped == lazy_dfs received_initial = lazy_dfs.collect() for vit in received_initial: - assert_frame_equal(expected, vit.data) + assert_frame_equal_with_arrow(expected, vit.data) received_roundtripped = roundtripped.collect() for vit in received_roundtripped: - assert_frame_equal(expected, vit.data) + assert_frame_equal_with_arrow(expected, vit.data) diff --git a/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py b/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py index acda7f5a67..07f11d9d1c 100644 --- a/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py +++ b/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py @@ -11,7 +11,7 @@ import pytest from arcticdb.version_store.processing import QueryBuilder -from arcticdb.util.test import assert_frame_equal +from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow from arcticdb.util.hypothesis import ( use_of_function_scoped_fixtures_in_hypothesis_checked, supported_numeric_dtypes, @@ -36,9 +36,10 @@ ), val=numeric_type_strategies(), ) -def test_project_numeric_binary_operation(lmdb_version_store_v1, df, val): +def test_project_numeric_binary_operation(lmdb_version_store_v1, df, val, any_output_format): assume(not df.empty) lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) 
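+    # The any_output_format fixture parametrises this test across the supported
+    # output formats (e.g. pandas and Arrow); the comparisons below therefore go
+    # through assert_frame_equal_with_arrow rather than assert_frame_equal.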
symbol = "test_project_numeric_binary_operation" lib.write(symbol, df) # Would be cleaner to use pytest.parametrize, but the expensive bit is generating/writing the df, so make sure we @@ -66,7 +67,7 @@ def test_project_numeric_binary_operation(lmdb_version_store_v1, df, val): df["c"] = pandas_lhs / pandas_rhs received = lib.read(symbol, query_builder=q).data try: - assert_frame_equal(df, received, check_dtype=False) + assert_frame_equal_with_arrow(df, received, check_dtype=False) except AssertionError as e: original_df = lib.read(symbol).data print( @@ -79,22 +80,23 @@ def test_project_numeric_binary_operation(lmdb_version_store_v1, df, val): @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(df=dataframe_strategy([column_strategy("a", supported_numeric_dtypes(), restrict_range=True)])) -def test_project_numeric_unary_operation(lmdb_version_store_v1, df): +def test_project_numeric_unary_operation(lmdb_version_store_v1, df, any_output_format): assume(not df.empty) lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_numeric_unary_operation" lib.write(symbol, df) q = QueryBuilder() q = q.apply("b", abs(q["a"])) df["b"] = abs(df["a"].astype(np.float64)) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(df, received, check_dtype=False) + assert_frame_equal_with_arrow(df, received, check_dtype=False) q = QueryBuilder() q = q.apply("b", -q["a"]) df["b"] = -(df["a"].astype(np.float64)) received = lib.read(symbol, query_builder=q).data try: - assert_frame_equal(df, received, check_dtype=False) + assert_frame_equal_with_arrow(df, received, check_dtype=False) except AssertionError as e: original_df = lib.read(symbol).data print( @@ -121,9 +123,10 @@ def test_project_numeric_unary_operation(lmdb_version_store_v1, df): ), val=numeric_type_strategies(), ) -def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, df, val): +def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, df, val, any_output_format): assume(len(df) >= 3) lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_project_numeric_binary_operation_dynamic" lib.delete(symbol) slices = [ @@ -157,7 +160,7 @@ def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_sch df["c"] = pandas_lhs / pandas_rhs received = lib.read(symbol, query_builder=q).data try: - assert_frame_equal(df, received, check_dtype=False) + assert_frame_equal_with_arrow(df, received, check_dtype=False) except AssertionError as e: original_df = lib.read(symbol).data print( @@ -171,9 +174,10 @@ def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_sch @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(df=dataframe_strategy([column_strategy("a", supported_floating_dtypes(), restrict_range=True)])) -def test_project_numeric_unary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, df): +def test_project_numeric_unary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, df, any_output_format): assume(len(df) >= 2) lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_project_numeric_unary_operation_dynamic" lib.delete(symbol) slices = [ @@ -187,9 +191,9 @@ def test_project_numeric_unary_operation_dynamic(lmdb_version_store_dynamic_sche q = q.apply("c", abs(q["a"])) df["c"] = abs(df["a"]) received = lib.read(symbol, query_builder=q).data - 
assert_frame_equal(df, received, check_dtype=False) + assert_frame_equal_with_arrow(df, received, check_dtype=False) q = QueryBuilder() q = q.apply("c", -q["a"]) df["c"] = -df["a"] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(df, received, check_dtype=False) \ No newline at end of file + assert_frame_equal_with_arrow(df, received, check_dtype=False) \ No newline at end of file diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index bbaf2c70b2..f94695764b 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -12,7 +12,7 @@ from arcticdb import col, concat, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest from arcticdb.exceptions import NoSuchVersionException, SchemaException from arcticdb.options import LibraryOptions -from arcticdb.util.test import assert_frame_equal, assert_series_equal +from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow, assert_series_equal from tests.util.mark import MACOS_WHEEL_BUILD, WINDOWS pytestmark = pytest.mark.pipeline @@ -23,8 +23,9 @@ @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=12)]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_basic(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, index, join): +def test_symbol_concat_basic(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, index, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -57,7 +58,7 @@ def test_symbol_concat_basic(lmdb_library_factory, dynamic_schema, rows_per_segm expected = pd.concat([df_0, df_1, df_2]) if index is None: expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received.data) + assert_frame_equal_with_arrow(expected, received.data) for idx, version in enumerate(received.versions): assert version.symbol == f"sym{idx}" assert version.version == 0 @@ -67,8 +68,9 @@ def test_symbol_concat_basic(lmdb_library_factory, dynamic_schema, rows_per_segm @pytest.mark.parametrize("first_type", ["uint8", "uint16", "uint32", "uint64", "int8", "int16", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize("second_type", ["uint8", "uint16", "uint32", "uint64", "int8", "int16", "int32", "int64", "float32", "float64"]) -def test_symbol_concat_type_promotion(lmdb_library, first_type, second_type): +def test_symbol_concat_type_promotion(lmdb_library, first_type, second_type, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) df0 = pd.DataFrame({"col": np.arange(1, dtype=np.dtype(first_type))}) df1 = pd.DataFrame({"col": np.arange(1, dtype=np.dtype(second_type))}) lib.write("sym0", df0) @@ -76,7 +78,7 @@ def test_symbol_concat_type_promotion(lmdb_library, first_type, second_type): received = concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect().data expected = pd.concat([df0, df1]) expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize( @@ 
-108,8 +110,9 @@ def test_symbol_concat_with_series(lmdb_library_factory, index, name_0, name_1, @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_different_column_sets(lmdb_library_factory, dynamic_schema, columns_per_segment, join): +def test_symbol_concat_different_column_sets(lmdb_library_factory, dynamic_schema, columns_per_segment, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, columns_per_segment=columns_per_segment)) + lib.set_output_format(any_output_format) # Use floats and strings so that our backfilling and Pandas' match df_0 = pd.DataFrame( { @@ -135,13 +138,14 @@ def test_symbol_concat_different_column_sets(lmdb_library_factory, dynamic_schem received = concat(lib.read_batch(["sym0", "sym1"], lazy=True), join=join).collect().data expected = pd.concat([df_0, df_1], join=join) expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) -def test_symbol_concat_integer_columns_outer_join(lmdb_library_factory, dynamic_schema, columns_per_segment): +def test_symbol_concat_integer_columns_outer_join(lmdb_library_factory, dynamic_schema, columns_per_segment, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, columns_per_segment=columns_per_segment)) + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(5, dtype=np.int64), @@ -168,12 +172,13 @@ def test_symbol_concat_integer_columns_outer_join(lmdb_library_factory, dynamic_ expected.index = pd.RangeIndex(len(expected)) expected.fillna(0, inplace=True) expected = expected.astype(np.int64) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_dynamic_schema_missing_columns(lmdb_library_factory, join): +def test_symbol_concat_dynamic_schema_missing_columns(lmdb_library_factory, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=True)) + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(5, dtype=np.float64), @@ -210,15 +215,16 @@ def test_symbol_concat_dynamic_schema_missing_columns(lmdb_library_factory, join received = concat(lib.read_batch(["sym0", "sym1"], lazy=True), join=join).collect().data expected = pd.concat([pd.concat([df_0, df_1], join="outer"), pd.concat([df_2, df_3], join="outer")], join=join) expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=5)]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_empty_column_intersection(lmdb_library_factory, dynamic_schema, columns_per_segment, index, join): +def test_symbol_concat_empty_column_intersection(lmdb_library_factory, dynamic_schema, columns_per_segment, index, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, columns_per_segment=columns_per_segment)) + 
lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(5, dtype=np.float64), @@ -247,7 +253,7 @@ def test_symbol_concat_empty_column_intersection(lmdb_library_factory, dynamic_s expected = pd.concat([df_0, df_1], join=join) if index is None: expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("dynamic_schema", [True, False]) @@ -255,8 +261,9 @@ def test_symbol_concat_empty_column_intersection(lmdb_library_factory, dynamic_s @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns", [["col1"], ["col2"], ["col3"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_column_slicing(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, columns, join): +def test_symbol_concat_column_slicing(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, columns, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -281,13 +288,14 @@ def test_symbol_concat_column_slicing(lmdb_library_factory, dynamic_schema, rows received = concat([lazy_df_0, lazy_df_1], join).collect().data expected = pd.concat([df_0.loc[:, columns], df_1.loc[:, columns]]) expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_filtering_with_column_selection(lmdb_library_factory, dynamic_schema, join): +def test_symbol_concat_filtering_with_column_selection(lmdb_library_factory, dynamic_schema, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema)) + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -315,13 +323,14 @@ def test_symbol_concat_filtering_with_column_selection(lmdb_library_factory, dyn print(received) expected = pd.concat([df_0.loc[:, columns], df_1.loc[:, columns]]) expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("only_incompletes", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes, join): +def test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes, join, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) if not only_incompletes: df_0 = pd.DataFrame({"col1": np.arange(3, dtype=np.float64), "col2": np.arange(3, 6, dtype=np.float64)}, index=pd.date_range("2025-01-01", periods=3)) lib.write("sym0", df_0) @@ -344,7 +353,7 @@ def test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes expected = pd.concat([df_1, df_2], join=join) else: expected = pd.concat([df_0, df_1, df_2], join=join) - assert_frame_equal(expected, received.data) + assert_frame_equal_with_arrow(expected, received.data) for idx, version in enumerate(received.versions): assert version.symbol == f"sym{idx}" assert version.data is None @@ -357,8 
+366,9 @@ def test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_multiindex_basic(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join): +def test_symbol_concat_multiindex_basic(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + lib.set_output_format(any_output_format) df = pd.DataFrame( { "col1": np.arange(12, dtype=np.int64), @@ -372,12 +382,13 @@ def test_symbol_concat_multiindex_basic(lmdb_library_factory, dynamic_schema, ro lib.write("sym2", df[7:]) received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True), join).collect().data - assert_frame_equal(df, received) + assert_frame_equal_with_arrow(df, received) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_with_date_range(lmdb_library, join): +def test_symbol_concat_with_date_range(lmdb_library, join, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -401,15 +412,16 @@ def test_symbol_concat_with_date_range(lmdb_library, join): received = concat([lazy_df_0, lazy_df_1], join).collect().data expected = pd.concat([df_0[:2], df_1[1:]]) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_complex(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join): +def test_symbol_concat_complex(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -450,11 +462,12 @@ def test_symbol_concat_complex(lmdb_library_factory, dynamic_schema, rows_per_se received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) expected = pd.concat([df_0, df_1[1:], df_2[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_symbol_concat_querybuilder_syntax(lmdb_library): +def test_symbol_concat_querybuilder_syntax(lmdb_library, any_output_format): lib = lmdb_library + lib.set_output_format(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -493,13 +506,14 @@ def test_symbol_concat_querybuilder_syntax(lmdb_library): received = received.reindex(columns=sorted(received.columns)) expected = pd.concat([df_0, df_1[1:], df_2[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize("index_name_0", [None, "ts1", "ts2"]) @pytest.mark.parametrize("index_name_1", 
[None, "ts1", "ts2"])
 @pytest.mark.parametrize("join", ["inner", "outer"])
-def test_symbol_concat_differently_named_timeseries(lmdb_library, index_name_0, index_name_1, join):
+def test_symbol_concat_differently_named_timeseries(lmdb_library, index_name_0, index_name_1, join, any_output_format):
     lib = lmdb_library
+    lib.set_output_format(any_output_format)
     df_0 = pd.DataFrame({"col1": np.arange(1, dtype=np.float64), "col2": np.arange(1, 2, dtype=np.float64)}, index=[pd.Timestamp(0)])
     df_1 = pd.DataFrame({"col1": np.arange(2, 3, dtype=np.float64), "col3": np.arange(3, 4, dtype=np.float64)}, index=[pd.Timestamp(1)])
     df_0.index.name = index_name_0
@@ -509,7 +523,7 @@ def test_symbol_concat_differently_named_timeseries(lmdb_library, index_name_0,
     received = concat(lib.read_batch(["sym0", "sym1"], lazy=True), join).collect().data
     expected = pd.concat([df_0, df_1], join=join)
     expected.index.name = index_name_0 if index_name_0 == index_name_1 else None
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


 @pytest.mark.parametrize("index_name_0_level_0", [None, "ts1", "ts2"])
@@ -517,7 +531,8 @@ def test_symbol_concat_differently_named_timeseries(lmdb_library, index_name_0,
 @pytest.mark.parametrize("index_name_1_level_0", [None, "ts1", "ts2"])
 @pytest.mark.parametrize("index_name_1_level_1", [None, "hello", "goodbye"])
 @pytest.mark.parametrize("join", ["inner", "outer"])
 def test_symbol_concat_differently_named_multiindexes(
+    any_output_format,
     lmdb_library,
     index_name_0_level_0,
     index_name_0_level_1,
@@ -526,6 +541,7 @@ def test_symbol_concat_differently_named_multiindexes(
     join
 ):
     lib = lmdb_library
+    lib.set_output_format(any_output_format)
     df_0 = pd.DataFrame(
         {
             "col1": np.arange(1, dtype=np.float64),
@@ -547,7 +563,7 @@ def test_symbol_concat_differently_named_multiindexes(
     expected_level_0_name = index_name_0_level_0 if index_name_0_level_0 == index_name_1_level_0 else None
     expected_level_1_name = index_name_0_level_1 if index_name_0_level_1 == index_name_1_level_1 else None
     expected.index.names = [expected_level_0_name, expected_level_1_name]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


 @pytest.mark.parametrize("tz_0", [None, "Europe/Amsterdam", "US/Eastern"])
@@ -673,8 +689,9 @@ def test_symbol_concat_pickled_data(lmdb_library):
         concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect()


-def test_symbol_concat_docstring_example(lmdb_library):
+def test_symbol_concat_docstring_example(lmdb_library, any_output_format):
     lib = lmdb_library
+    lib.set_output_format(any_output_format)
     df0 = pd.DataFrame(
         {
             "col": [0, 1, 2, 3, 4],
@@ -695,4 +712,4 @@ def test_symbol_concat_docstring_example(lmdb_library):
     lazy_df = concat([lazy_df0, lazy_df1])
     lazy_df = lazy_df.resample("10min").agg({"col": "sum"})
     received = lazy_df.collect().data
-    assert_frame_equal(pd.DataFrame({"col": [14]}, index=[pd.Timestamp("2025-01-01")]), received)
\ No newline at end of file
+    assert_frame_equal_with_arrow(pd.DataFrame({"col": [14]}, index=[pd.Timestamp("2025-01-01")]), received)
\ No newline at end of file
diff --git a/python/tests/unit/arcticdb/version_store/test_ternary.py b/python/tests/unit/arcticdb/version_store/test_ternary.py
index 44e37a2def..c38a2a9542 100644
--- a/python/tests/unit/arcticdb/version_store/test_ternary.py
+++ b/python/tests/unit/arcticdb/version_store/test_ternary.py
@@ -16,7 +16,7 @@
 from arcticdb import QueryBuilder, where
 from 
arcticdb_ext.exceptions import InternalException, SchemaException, UserInputException from arcticdb.util.hypothesis import use_of_function_scoped_fixtures_in_hypothesis_checked -from arcticdb.util.test import assert_frame_equal +from arcticdb.util.test import assert_frame_equal, assert_frame_equal_with_arrow from tests.util.mark import WINDOWS @@ -28,8 +28,9 @@ # holding those particular types -def test_project_ternary_condition_as_full_and_empty_result(lmdb_version_store_v1): +def test_project_ternary_condition_as_full_and_empty_result(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_condition_as_full_and_empty_result" df = pd.DataFrame( { @@ -48,18 +49,19 @@ def test_project_ternary_condition_as_full_and_empty_result(lmdb_version_store_v q = QueryBuilder() q = q.apply("new_col", where(~(q["conditional"] != 0), q["col1"], q["col2"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # EmptyResult expected["new_col"] = np.where((df["conditional"] != 0).to_numpy(), df["col1"].to_numpy(), df["col2"].to_numpy()) q = QueryBuilder() q = q.apply("new_col", where(q["conditional"] != 0, q["col1"], q["col2"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_column_column_numeric(lmdb_version_store_v1): +def test_project_ternary_column_column_numeric(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_column_column_numeric" df = pd.DataFrame( { @@ -80,7 +82,7 @@ def test_project_ternary_column_column_numeric(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["int64_1"], q["int64_2"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # One type a subset of the other expected = copy.deepcopy(df) @@ -88,7 +90,7 @@ def test_project_ternary_column_column_numeric(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["int64_1"], q["int8"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Promotable type exists expected = copy.deepcopy(df) @@ -96,7 +98,7 @@ def test_project_ternary_column_column_numeric(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["int8"], q["uint8"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # uint64/int64 mix expected = copy.deepcopy(df) @@ -104,11 +106,12 @@ def test_project_ternary_column_column_numeric(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["int8"], q["uint64"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_column_column_dynamic_strings(lmdb_version_store_v1): +def test_project_ternary_column_column_dynamic_strings(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_column_column_dynamic_strings" df = pd.DataFrame( 
{ @@ -125,7 +128,7 @@ def test_project_ternary_column_column_dynamic_strings(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["col1"], q["col2"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.skipif(WINDOWS, reason="We do not support fixed-width strings on Windows") @@ -155,8 +158,9 @@ def test_project_ternary_fixed_width_strings(version_store_factory): lib.read(symbol, query_builder=q) -def test_project_ternary_column_value_numeric(lmdb_version_store_v1): +def test_project_ternary_column_value_numeric(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_column_value_numeric" df = pd.DataFrame( { @@ -172,18 +176,19 @@ def test_project_ternary_column_value_numeric(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["col1"], 10)) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Swap operands expected["new_col"] = np.where(df["conditional"].to_numpy(), 10, df["col1"].to_numpy()) q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], 10, q["col1"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_column_value_strings(lmdb_version_store_v1): +def test_project_ternary_column_value_strings(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_column_value_strings" df = pd.DataFrame( { @@ -199,18 +204,19 @@ def test_project_ternary_column_value_strings(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["col1"], "h")) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Swap operands expected["new_col"] = np.where(df["conditional"].to_numpy(), "h", df["col1"].to_numpy()) q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], "h", q["col1"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_value_value_numeric(lmdb_version_store_v1): +def test_project_ternary_value_value_numeric(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_value_value_numeric" df = pd.DataFrame( { @@ -226,11 +232,12 @@ def test_project_ternary_value_value_numeric(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], 0, 1)) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_value_value_string(lmdb_version_store_v1): +def test_project_ternary_value_value_string(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_value_value_string" df = pd.DataFrame( { @@ -245,7 +252,7 @@ def test_project_ternary_value_value_string(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], "hello", "goodbye")) received = 
lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) @pytest.mark.parametrize( @@ -258,10 +265,11 @@ def test_project_ternary_value_value_string(lmdb_version_store_v1): ) ] ) -def test_project_ternary_column_sliced(version_store_factory, index): +def test_project_ternary_column_sliced(version_store_factory, index, any_output_format): # Cannot use lmdb_version_store_tiny_segment as it has fixed-width strings, which are not supported with the ternary # operator lib = version_store_factory(dynamic_strings=True, column_group_size=2, segment_row_size=2) + lib.set_output_format(any_output_format) symbol = "test_project_ternary_column_sliced_range_index" # This fixture has 2 columns per slice, so the column groups will be: # - ["conditional", num_1] @@ -287,7 +295,7 @@ def test_project_ternary_column_sliced(version_store_factory, index): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["num_1"], q["num_2"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # String expected = df @@ -295,11 +303,12 @@ def test_project_ternary_column_sliced(version_store_factory, index): q = QueryBuilder() q = q.apply("new_col", where(q["conditional"], q["str_1"], q["str_2"])) received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1): +def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_dynamic_missing_columns" all_columns_df = pd.DataFrame( { @@ -329,7 +338,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected = pd.concat([all_columns_df, update_df]).fillna(0) expected["col1"] = expected["col1"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), 100) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # right column missing with value update_df = base_update_df.drop(columns="col2") @@ -340,7 +349,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected = pd.concat([all_columns_df, update_df]).fillna(0) expected["col2"] = expected["col2"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), 100, expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # conditional column missing update_df = base_update_df.drop(columns="conditional") @@ -350,7 +359,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]).fillna(False) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # left column missing with column update_df = base_update_df.drop(columns="col1") @@ -359,7 +368,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected = pd.concat([all_columns_df, update_df]).fillna(0) expected["col1"] = 
expected["col1"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # right column missing with column update_df = base_update_df.drop(columns="col2") @@ -368,7 +377,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected = pd.concat([all_columns_df, update_df]).fillna(0) expected["col2"] = expected["col2"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # conditional and left columns missing update_df = base_update_df.drop(columns=["conditional", "col1"]) @@ -379,7 +388,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected["col1"].fillna(0, inplace=True) expected["col1"] = expected["col1"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # conditional and right columns missing update_df = base_update_df.drop(columns=["conditional", "col2"]) @@ -390,7 +399,7 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected["col2"].fillna(0, inplace=True) expected["col2"] = expected["col2"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # left and right columns missing update_df = base_update_df.drop(columns=["col1", "col2"]) @@ -400,11 +409,12 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche expected["col1"] = expected["col1"].astype("int64") expected["col2"] = expected["col2"].astype("int64") expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dynamic_schema_v1): +def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib.set_output_format(any_output_format) symbol = "test_project_ternary_dynamic_missing_columns_strings" all_columns_df = pd.DataFrame( { @@ -433,7 +443,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), "e") - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # right column missing with value update_df = base_update_df.drop(columns="col2") @@ -443,7 +453,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]) expected["new_col"] = np.where(expected["conditional"].to_numpy(), "e", expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + 
assert_frame_equal_with_arrow(expected, received) # conditional column missing update_df = base_update_df.drop(columns="conditional") @@ -453,7 +463,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]).fillna(False) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # left column missing with column update_df = base_update_df.drop(columns="col1") @@ -461,7 +471,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # right column missing with column update_df = base_update_df.drop(columns="col2") @@ -469,7 +479,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # conditional and left columns missing update_df = base_update_df.drop(columns=["conditional", "col1"]) @@ -478,7 +488,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna expected = pd.concat([all_columns_df, update_df]) expected["conditional"].fillna(False, inplace=True) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # conditional and right columns missing update_df = base_update_df.drop(columns=["conditional", "col2"]) @@ -487,7 +497,7 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna expected = pd.concat([all_columns_df, update_df]) expected["conditional"].fillna(False, inplace=True) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # left and right columns missing update_df = base_update_df.drop(columns=["col1", "col2"]) @@ -495,11 +505,12 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna received = lib.read(symbol, query_builder=q).data expected = pd.concat([all_columns_df, update_df]) expected["new_col"] = np.where(expected["conditional"].to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_sparse_col_val(lmdb_version_store_v1): +def test_project_ternary_sparse_col_val(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_project_ternary_sparse_col_val" df = pd.DataFrame( { @@ -517,14 +528,14 @@ def test_project_ternary_sparse_col_val(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("projected", 
where(q["condition"] == 1.0, q["col"], 5)) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Dense output expected = df expected["projected"] = np.where(expected["col"].notnull().to_numpy(), expected["col"].to_numpy(), 5.0) q = QueryBuilder() q = q.apply("projected", where(q["col"].notnull(), q["col"], 5)) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Val/col # Sparse output @@ -533,18 +544,19 @@ def test_project_ternary_sparse_col_val(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("projected", where(q["condition"] == 1.0, 5, q["col"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Dense output expected = df expected["projected"] = np.where((expected["col"].isnull()).to_numpy(), 5.0, expected["col"].to_numpy()) q = QueryBuilder() q = q.apply("projected", where(q["col"].isnull(), 5.0, q["col"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_sparse_col_col(lmdb_version_store_v1): +def test_project_ternary_sparse_col_col(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_project_ternary_sparse_col_col" df = pd.DataFrame( { @@ -565,28 +577,28 @@ def test_project_ternary_sparse_col_col(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("projected", where(q["condition1"] == 1.0, q["col1"], q["col2"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Left input sparse, right input dense expected = df expected["projected"] = np.where((expected["condition1"] == 1.0).to_numpy(), expected["col1"].to_numpy(), expected["condition2"].to_numpy()) q = QueryBuilder() q = q.apply("projected", where(q["condition1"] == 1.0, q["col1"], q["condition2"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Left input dense, right input sparse expected = df expected["projected"] = np.where((expected["condition1"] == 1.0).to_numpy(), expected["condition2"].to_numpy(), expected["col2"].to_numpy()) q = QueryBuilder() q = q.apply("projected", where(q["condition1"] == 1.0, q["condition2"], q["col2"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Both inputs dense expected = df expected["projected"] = np.where((expected["condition1"] == 1.0).to_numpy(), expected["condition2"].to_numpy(), expected["condition2"].to_numpy()) q = QueryBuilder() q = q.apply("projected", where(q["condition1"] == 1.0, q["condition2"], q["condition2"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Dense output expected = df @@ -594,7 +606,7 @@ def test_project_ternary_sparse_col_col(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("projected", where(q["condition2"] == 0.0, q["col1"], q["!col1"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Empty output expected = df @@ -602,11 +614,12 @@ 
def test_project_ternary_sparse_col_col(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("projected", where(q["condition2"] == 1.0, q["col1"], q["!col1"])) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_project_ternary_condition_empty(lmdb_version_store_v1): +def test_project_ternary_condition_empty(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) sym = "test_project_ternary_condition_empty" df = pd.DataFrame({"condition": [0.0, 0.0, 0.0], "col1": [0.0, np.nan, np.nan], "col2": [0.0, np.nan, np.nan]}, index=pd.date_range("2024-01-01", periods=3)) lib.write(sym, df, sparsify_floats=True) @@ -615,11 +628,12 @@ def test_project_ternary_condition_empty(lmdb_version_store_v1): q = QueryBuilder() q = q.apply("projected", where(q["condition"].isnull(), q["col1"], np.float64(2000))) received = lib.read(sym, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_filter_ternary_bitset_bitset(lmdb_version_store_v1): +def test_filter_ternary_bitset_bitset(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_ternary_bitset_bitset" df = pd.DataFrame( { @@ -635,11 +649,12 @@ def test_filter_ternary_bitset_bitset(lmdb_version_store_v1): q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 4, q["col2"] == 4)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_filter_ternary_bitset_column(lmdb_version_store_v1): +def test_filter_ternary_bitset_column(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_ternary_bitset_column" df = pd.DataFrame( { @@ -655,17 +670,18 @@ def test_filter_ternary_bitset_column(lmdb_version_store_v1): q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 4, q["col2"])] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), df["col2"].to_numpy(), (df["col1"] < 4).to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], q["col2"], q["col1"] < 4)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_filter_ternary_bool_columns(lmdb_version_store_v1): +def test_filter_ternary_bool_columns(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_ternary_bool_columns" df = pd.DataFrame( { @@ -681,29 +697,30 @@ def test_filter_ternary_bool_columns(lmdb_version_store_v1): q = QueryBuilder() q = q[where(q["conditional"], q["col1"], q["col2"])] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), df["col2"].to_numpy(), df["col1"].to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], q["col2"], q["col1"])] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), 
df["col1"].to_numpy(), True)] q = QueryBuilder() q = q[where(q["conditional"], q["col1"], True)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), False, df["col2"].to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], False, q["col2"])] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_filter_ternary_bitset_value(lmdb_version_store_v1): +def test_filter_ternary_bitset_value(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_ternary_bitset_value" df = pd.DataFrame( { @@ -718,29 +735,30 @@ def test_filter_ternary_bitset_value(lmdb_version_store_v1): q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 4, False)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), (df["col1"] < 4).to_numpy(), True)] q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 4, True)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), False, (df["col1"] < 4).to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], False, q["col1"] < 4)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) expected = df[np.where(df["conditional"].to_numpy(), True, (df["col1"] < 4).to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], True, q["col1"] < 4)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) -def test_filter_ternary_bitset_full_and_empty_results(lmdb_version_store_v1): +def test_filter_ternary_bitset_full_and_empty_results(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib.set_output_format(any_output_format) symbol = "test_filter_ternary_bitset_full_and_empty_results" df = pd.DataFrame( { @@ -756,32 +774,33 @@ def test_filter_ternary_bitset_full_and_empty_results(lmdb_version_store_v1): q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 4, q["col1"] < 0)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Empty result as left operand expected = df[np.where(df["conditional"].to_numpy(), (df["col1"] < 0).to_numpy(), (df["col1"] < 4).to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 0, q["col1"] < 4)] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Full result as right operand expected = df[np.where(df["conditional"].to_numpy(), (df["col1"] < 4).to_numpy(), (~(df["col1"] < 0)).to_numpy())] q = QueryBuilder() q = q[where(q["conditional"], q["col1"] < 4, ~(q["col1"] < 0))] received = lib.read(symbol, query_builder=q).data - assert_frame_equal(expected, received) + assert_frame_equal_with_arrow(expected, received) # Full result as left operand expected = df[np.where(df["conditional"].to_numpy(), (~(df["col1"] < 0)).to_numpy(), (df["col1"] < 
     q = QueryBuilder()
     q = q[where(q["conditional"], ~(q["col1"] < 0), q["col1"] < 4)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


-def test_filter_ternary_column_full_and_empty_results(lmdb_version_store_v1):
+def test_filter_ternary_column_full_and_empty_results(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     symbol = "test_filter_ternary_column_full_and_empty_results"
     df = pd.DataFrame(
         {
@@ -798,33 +817,34 @@ def test_filter_ternary_column_full_and_empty_results(lmdb_version_store_v1):
     q = QueryBuilder()
     q = q[where(q["conditional"], q["col1"], q["col2"] < 0)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Empty result as left operand
     expected = df[np.where(df["conditional"].to_numpy(), (df["col2"] < 0).to_numpy(), df["col1"].to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], q["col2"] < 0, q["col1"])]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Full result as right operand
     expected = df[np.where(df["conditional"].to_numpy(), df["col1"].to_numpy(), (~(df["col2"] < 0)).to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], q["col1"], ~(q["col2"] < 0))]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Full result as left operand
     expected = df[np.where(df["conditional"].to_numpy(), (~(df["col2"] < 0)).to_numpy(), df["col1"].to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], ~(q["col2"] < 0), q["col1"])]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


 @pytest.mark.parametrize("value", [True, False])
-def test_filter_ternary_value_full_and_empty_results(lmdb_version_store_v1, value):
+def test_filter_ternary_value_full_and_empty_results(lmdb_version_store_v1, value, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     symbol = "test_filter_ternary_value_full_and_empty_results"
     df = pd.DataFrame(
         {
@@ -840,32 +860,33 @@ def test_filter_ternary_value_full_and_empty_results(lmdb_version_store_v1, valu
     q = QueryBuilder()
     q = q[where(q["conditional"], value, q["col2"] < 0)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Empty result as left operand
     expected = df[np.where(df["conditional"].to_numpy(), (df["col2"] < 0).to_numpy(), value)]
     q = QueryBuilder()
     q = q[where(q["conditional"], q["col2"] < 0, value)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Full result as right operand
     expected = df[np.where(df["conditional"].to_numpy(), value, (~(df["col2"] < 0)).to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], value, ~(q["col2"] < 0))]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Full result as left operand
     expected = df[np.where(df["conditional"].to_numpy(), (~(df["col2"] < 0)).to_numpy(), value)]
     q = QueryBuilder()
     q = q[where(q["conditional"], ~(q["col2"] < 0), value)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


-def test_filter_ternary_full_and_empty_results_squared(lmdb_version_store_v1):
+def test_filter_ternary_full_and_empty_results_squared(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     symbol = "test_filter_ternary_full_and_empty_results_squared"
     df = pd.DataFrame(
         {
@@ -881,28 +902,28 @@ def test_filter_ternary_full_and_empty_results_squared(lmdb_version_store_v1):
     q = QueryBuilder()
     q = q[where(q["conditional"], ~(q["col2"] < 0), ~(q["col2"] < 0))]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Full/Empty
     expected = df[np.where(df["conditional"].to_numpy(), (~(df["col2"] < 0)).to_numpy(), (df["col2"] < 0).to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], ~(q["col2"] < 0), q["col2"] < 0)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Empty/Full
     expected = df[np.where(df["conditional"].to_numpy(), (df["col2"] < 0).to_numpy(), (~(df["col2"] < 0)).to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], q["col2"] < 0, ~(q["col2"] < 0))]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # Empty/Empty
     expected = df[np.where(df["conditional"].to_numpy(), (df["col2"] < 0).to_numpy(), (df["col2"] < 0).to_numpy())]
     q = QueryBuilder()
     q = q[where(q["conditional"], q["col2"] < 0, q["col2"] < 0)]
     received = lib.read(symbol, query_builder=q).data
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)


 def test_filter_ternary_invalid_conditions(lmdb_version_store_v1):
@@ -984,8 +1005,9 @@ def test_filter_ternary_pythonic_syntax():
         q[q["col1"] if q["conditional"] else q["col2"]]


-def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1):
+def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1, any_output_format):
     lib = lmdb_version_store_dynamic_schema_v1
+    lib.set_output_format(any_output_format)
     symbol = "test_filter_ternary_dynamic_missing_columns"
     all_columns_df = pd.DataFrame(
         {
@@ -1015,7 +1037,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected = pd.concat([all_columns_df, update_df]).fillna(0)
     expected["col1"] = expected["col1"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), True)]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # right column missing with value
     update_df = base_update_df.drop(columns="col2")
@@ -1026,7 +1048,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected = pd.concat([all_columns_df, update_df]).fillna(0)
     expected["col2"] = expected["col2"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), False, (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # conditional column missing
     update_df = base_update_df.drop(columns="conditional")
@@ -1036,7 +1058,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     received = lib.read(symbol, query_builder=q).data
     expected = pd.concat([all_columns_df, update_df]).fillna(False)
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # left column missing
     update_df = base_update_df.drop(columns="col1")
@@ -1045,7 +1067,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected = pd.concat([all_columns_df, update_df]).fillna(0)
     expected["col1"] = expected["col1"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # right column missing
     update_df = base_update_df.drop(columns="col2")
@@ -1054,7 +1076,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected = pd.concat([all_columns_df, update_df]).fillna(0)
     expected["col2"] = expected["col2"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # conditional and left column missing
     update_df = base_update_df.drop(columns=["conditional", "col1"])
@@ -1065,7 +1087,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected["col1"].fillna(0, inplace=True)
     expected["col1"] = expected["col1"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # conditional and right column missing
     update_df = base_update_df.drop(columns=["conditional", "col2"])
@@ -1076,7 +1098,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected["col2"].fillna(0, inplace=True)
     expected["col2"] = expected["col2"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received,)
+    assert_frame_equal_with_arrow(expected, received)

     # left and right column missing
     update_df = base_update_df.drop(columns=["col1", "col2"])
@@ -1086,7 +1108,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
     expected["col1"] = expected["col1"].astype("int64")
     expected["col2"] = expected["col2"].astype("int64")
     expected = expected[np.where(expected["conditional"].to_numpy(), (expected["col1"] == 1).to_numpy(), (expected["col2"] == 12).to_numpy())]
-    assert_frame_equal(expected, received)
+    assert_frame_equal_with_arrow(expected, received)

     # TODO: Assert that the projected column is of type float64 after modify_schema change is merged

@@ -1101,9 +1123,10 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
         ),
     ),
 )
-def test_ternary_hypothesis(lmdb_version_store_v1, df):
+def test_ternary_hypothesis(lmdb_version_store_v1, df, any_output_format):
     assume(not df.empty and not df["condition"].isnull().all() and not df["col1"].isnull().all() and not df["col2"].isnull().all())
     lib = lmdb_version_store_v1
+    lib.set_output_format(any_output_format)
     dense_sym = "test_ternary_hypothesis_dense"
     sparse_sym = "test_ternary_hypothesis_sparse"

"test_ternary_hypothesis_sparse" @@ -1118,29 +1141,29 @@ def test_ternary_hypothesis(lmdb_version_store_v1, df): expected["projected"] = np.where(expected["condition"].isnull().to_numpy(), expected["col1"].to_numpy(), expected["col2"].to_numpy()) q = QueryBuilder() q = q.apply("projected", where(q["condition"].isnull(), q["col1"], q["col2"])) - assert_frame_equal(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) - assert_frame_equal(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) # col/val expected = df.copy(deep=True) expected["projected"] = np.where(expected["condition"].isnull().to_numpy(), expected["col1"].to_numpy(), 2000.0) q = QueryBuilder() q = q.apply("projected", where(q["condition"].isnull(), q["col1"], 2000)) - assert_frame_equal(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) - assert_frame_equal(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) # val/col expected = df.copy(deep=True) expected["projected"] = np.where(expected["condition"].isnull().to_numpy(), 2000.0, expected["col2"].to_numpy()) q = QueryBuilder() q = q.apply("projected", where(q["condition"].isnull(), 2000, q["col2"])) - assert_frame_equal(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) - assert_frame_equal(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) # val/val expected = df.copy(deep=True) expected["projected"] = np.where(expected["condition"].isnull().to_numpy(), 2000.0, 3000.0) q = QueryBuilder() q = q.apply("projected", where(q["condition"].isnull(), 2000, 3000)) - assert_frame_equal(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) - assert_frame_equal(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) # Filters # Only test col/col, col/val etc can be achieved more efficiently without using the ternary operator @@ -1148,5 +1171,5 @@ def test_ternary_hypothesis(lmdb_version_store_v1, df): expected = expected[np.where(expected["condition"].isnull().to_numpy(), expected["col1"].isnull().to_numpy(), expected["col2"].isnull().to_numpy())] q = QueryBuilder() q = q[where(q["condition"].isnull(), q["col1"].isnull(), q["col2"].isnull())] - assert_frame_equal(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) - assert_frame_equal(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(dense_sym, query_builder=q).data, check_dtype=False) + assert_frame_equal_with_arrow(expected, lib.read(sparse_sym, query_builder=q).data, check_dtype=False)