From 4f3fdfd895b11637750b78b67d6f1b6d5fe05f2e Mon Sep 17 00:00:00 2001
From: Ivo Dilov
Date: Thu, 2 Oct 2025 12:21:53 +0300
Subject: [PATCH 1/3] Fix resample empty bucket with date range

In some fairly niche cases the processing pipeline can produce empty segments:
- Resampling with dynamic schema
- A date range filter with no index values within an intersecting data key
  (fixed in a previous PR)

We previously tried to allocate a zero-sized memory block, which raised
assertion failures.

This PR just skips allocating the zero-sized memory blocks and adds an arrow
test to verify the fix.
---
 cpp/arcticdb/pipeline/read_frame.cpp          | 16 +++++++++-----
 .../unit/arcticdb/version_store/test_arrow.py | 21 +++++++++++++++++++
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp
index 3f3ce29043..9a3ce9fd54 100644
--- a/cpp/arcticdb/pipeline/read_frame.cpp
+++ b/cpp/arcticdb/pipeline/read_frame.cpp
@@ -107,11 +107,17 @@ SegmentInMemory allocate_chunked_frame(const std::shared_ptr& c
     };
     auto handlers = TypeHandlerRegistry::instance();
-    if (row_count > 0) {
-        for (auto& column : output.columns()) {
-            auto handler = handlers->get_handler(output_format, column->type());
-            const auto data_size = data_type_size(column->type(), output_format, DataTypeMode::EXTERNAL);
-            for (auto block_row_count : block_row_counts) {
+    for (auto& column : output.columns()) {
+        auto handler = handlers->get_handler(output_format, column->type());
+        const auto data_size = data_type_size(column->type(), output_format, DataTypeMode::EXTERNAL);
+        for (auto block_row_count : block_row_counts) {
+            if (block_row_count > 0) {
+                // We can end up with empty segments from the processing pipeline, e.g. when:
+                // - Filtering a data key to the empty set (e.g. date_range = (3, 3) in a data key with no index=3)
+                // - Resampling with a date range where a bucket slice contains no index values
+                // Zero-sized memory blocks would break the offset assumptions in chunked buffers, and it is fine for
+                // the number of memory blocks to differ from the number of segments, because follow-up methods like
+                // `copy_frame_data_to_buffer` rely on offsets rather than block indices.
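+                // The allocation below is unchanged: reserve block_row_count * data_size bytes for this
+                // block, then advance the column's data position so the next block starts at the right offset.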
            const auto bytes = block_row_count * data_size;
            column->allocate_data(bytes);
            column->advance_data(bytes);
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow.py b/python/tests/unit/arcticdb/version_store/test_arrow.py
index bd3e90a62b..91efb06116 100644
--- a/python/tests/unit/arcticdb/version_store/test_arrow.py
+++ b/python/tests/unit/arcticdb/version_store/test_arrow.py
@@ -904,3 +904,24 @@ def gen_df(start, num_rows, with_columns=True):
     assert pc.count(table.column("count_col"), mode="only_null").as_py() == 4
     expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
     assert_frame_equal_with_arrow(table, expected)
+
+
+def test_resample_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_segment):
+    # Closely mimics test_resampling_row_slice_responsible_for_no_buckets from test_resample.py, but with arrow
+    # TODO: Remove this test if we enable pipeline tests with arrow
+    lib = lmdb_version_store_tiny_segment
+    lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+    sym = "sym"
+    df = pd.DataFrame(
+        {
+            "to_sum": [1, 2, 3, 4],
+        },
+        index=[pd.Timestamp(0), pd.Timestamp(100), pd.Timestamp(200), pd.Timestamp(3000)],
+    )
+    lib.write(sym, df)
+
+    q = QueryBuilder().resample("us").agg({"to_sum": ("to_sum", "sum")})
+    date_range = (pd.Timestamp(0), pd.Timestamp(1500))
+    table = lib.read(sym, date_range=date_range, query_builder=q).data
+    expected = pd.DataFrame({"to_sum": [6]}, index=[pd.Timestamp(0)])
+    assert_frame_equal_with_arrow(table, expected)

From 3ebb24a736092ece0d8e0cefc251e37c2b98a294 Mon Sep 17 00:00:00 2001
From: Ivo Dilov
Date: Thu, 2 Oct 2025 18:09:19 +0300
Subject: [PATCH 2/3] Fix arrow reading empty frames

During symbol concat we can end up with a Segment with zero columns.
Convert that to arrow gracefully.

This also uncovered that arrow normalization doesn't correctly construct
pandas_metadata for empty dataframes; that is fixed and tested in this PR
as well.
---
 cpp/arcticdb/arrow/arrow_utils.cpp            |  5 ++
 .../arcticdb/version_store/_normalization.py  |  4 +-
 python/arcticdb/version_store/_store.py       |  6 ++-
 .../unit/arcticdb/version_store/test_arrow.py | 47 +++++++++++++++++++
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/cpp/arcticdb/arrow/arrow_utils.cpp b/cpp/arcticdb/arrow/arrow_utils.cpp
index 863540abae..c5ca967997 100644
--- a/cpp/arcticdb/arrow/arrow_utils.cpp
+++ b/cpp/arcticdb/arrow/arrow_utils.cpp
@@ -63,6 +63,11 @@ std::vector arrow_arrays_from_column(const Column& column, std::
 std::shared_ptr> segment_to_arrow_data(SegmentInMemory& segment) {
     const auto total_blocks = segment.num_blocks();
     const auto num_columns = segment.num_columns();
+    if (num_columns == 0) {
+        // We can't construct a record batch with no columns, so in this case we return an empty list of record batches,
+        // which needs special handling in Python.
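+        // `_adapt_read_res` in _store.py maps the empty list to `pa.Table.from_arrays([])` (a 0x0 table),
+        // since `pa.Table.from_batches` cannot infer a schema from zero batches.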
+        return {};
+    }
     const auto column_blocks = segment.column(0).num_blocks();
     util::check(total_blocks == column_blocks * num_columns, "Expected regular block size");
diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py
index a49ce527b5..cc1df90e7e 100644
--- a/python/arcticdb/version_store/_normalization.py
+++ b/python/arcticdb/version_store/_normalization.py
@@ -742,7 +742,9 @@ def denormalize(self, item, norm_meta):
         index_type = pandas_meta.WhichOneof("index_type")
         if index_type == "index":
             index_meta = pandas_meta.index
-            if index_meta.is_physically_stored:
+            # Empty tables don't have `is_physically_stored=True`, but we still output them with an empty DatetimeIndex.
+            is_empty_table_with_datetime_index = len(item) == 0 and not index_meta.step
+            if index_meta.is_physically_stored or is_empty_table_with_datetime_index:
                 pandas_indexes = 1
                 if index_meta.tz:
                     timezones[0] = index_meta.tz
diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index 091ea9fdd6..8795e2ae06 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -2432,7 +2432,11 @@ def _adapt_read_res(self, read_result: ReadResult) -> VersionedItem:
             record_batches = []
             for record_batch in frame_data.extract_record_batches():
                 record_batches.append(pa.RecordBatch._import_from_c(record_batch.array(), record_batch.schema()))
-            table = pa.Table.from_batches(record_batches)
+            if len(record_batches) == 0:
+                # We get an empty list of record batches when the output has no columns
+                table = pa.Table.from_arrays([])
+            else:
+                table = pa.Table.from_batches(record_batches)
             data = self._arrow_normalizer.denormalize(table, read_result.norm)
         else:
             data = self._normalizer.denormalize(read_result.frame_data, read_result.norm)
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow.py b/python/tests/unit/arcticdb/version_store/test_arrow.py
index 91efb06116..95a102d60e 100644
--- a/python/tests/unit/arcticdb/version_store/test_arrow.py
+++ b/python/tests/unit/arcticdb/version_store/test_arrow.py
@@ -75,6 +75,35 @@ def test_bool_columns(lmdb_version_store_arrow):
     assert_frame_equal_with_arrow(table, df)
+
+
+def test_read_empty(lmdb_version_store_arrow):
+    lib = lmdb_version_store_arrow
+    sym = "sym"
+    df = pd.DataFrame()
+    lib.write(sym, df)
+    table = lib.read(sym).data
+    expected = lib.read(sym, output_format=OutputFormat.PANDAS).data
+    # During normalization of the write we attach an empty DatetimeIndex to the DataFrame. We correctly see it
+    # in arrow.
+    assert table.column_names == ["index"]
+    assert table.shape == (0, 1)
+    # arcticdb read(output_format=PANDAS) produces a `pd.RangeIndex(start=0, stop=0, step=1)` column index if there are no columns;
+    # pyarrow's to_pandas produces `pd.Index([])` if there are no columns.
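+    # Overwrite expected's RangeIndex columns with an empty Index so both frames use pyarrow's convention.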
+ expected.columns = pd.Index([]) + assert_frame_equal_with_arrow(table, expected) + + +def test_read_empty_with_columns(lmdb_version_store_arrow): + lib = lmdb_version_store_arrow + sym = "sym" + df = pd.DataFrame({"col_int": np.zeros(0, dtype=np.int32), "col_float": np.zeros(0, dtype=np.float64)}) + lib.write(sym, df) + table = lib.read(sym).data + expected = lib.read(sym, output_format=OutputFormat.PANDAS).data + assert table.column_names == ["index", "col_int", "col_float"] + assert table.shape == (0, 3) + assert_frame_equal_with_arrow(table, expected) + + def test_column_filtering(lmdb_version_store_arrow): lib = lmdb_version_store_arrow df = pd.DataFrame({"x": np.arange(10), "y": np.arange(10.0, 20.0)}) @@ -925,3 +954,21 @@ def test_resample_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_s table = lib.read(sym, date_range=date_range, query_builder=q).data expected = pd.DataFrame({"to_sum": [6]}, index=[pd.Timestamp(0)]) assert_frame_equal_with_arrow(table, expected) + + +def test_symbol_concat_empty_intersection(lmdb_version_store_arrow): + # Tests a failing subset of test_symbol_concat_empty_column_intersection + # TODO: Remove this test if we enable pipeline tests with arrow + lib = lmdb_version_store_arrow + sym_0 = "sym_0" + sym_1 = "sym_1" + df_0 = pd.DataFrame({"col_0": [0]}) + df_1 = pd.DataFrame({"col_1": [1]}) + lib.write(sym_0, df_0) + lib.write(sym_1, df_1) + q = QueryBuilder().concat("inner") + table = lib.batch_read_and_join([sym_0, sym_1], query_builder=q).data + assert table.column_names == [] + assert table.shape == (0, 0) + expected = pd.DataFrame() + assert_frame_equal_with_arrow(table, expected) \ No newline at end of file From 052433ff226d069f9deb1d2ac5692e6f39f8e2c8 Mon Sep 17 00:00:00 2001 From: Ivo Dilov Date: Mon, 25 Aug 2025 11:21:14 +0300 Subject: [PATCH 3/3] Parametrize all pipeline tests to run with arrow output format --- python/arcticdb/version_store/_store.py | 28 ++- .../arcticdb/test_aggregation_hypothesis.py | 14 +- .../hypothesis/arcticdb/test_resample.py | 35 ++-- .../version_store/test_basic_version_store.py | 63 +++--- .../tests/unit/arcticdb/test_column_stats.py | 53 +++-- .../version_store/test_aggregation.py | 123 ++++++++---- .../arcticdb/version_store/test_filtering.py | 182 ++++++++++++------ .../test_filtering_hypothesis.py | 46 +++-- .../unit/arcticdb/version_store/test_head.py | 46 +++-- .../version_store/test_lazy_dataframe.py | 77 +++++--- .../arcticdb/version_store/test_projection.py | 26 ++- .../test_projection_hypothesis.py | 14 +- .../version_store/test_query_builder.py | 2 +- .../version_store/test_query_builder_batch.py | 23 ++- .../test_query_builder_sparse.py | 8 +- .../arcticdb/version_store/test_resample.py | 129 +++++++++---- .../arcticdb/version_store/test_row_range.py | 32 +-- .../test_symbol_concatenation.py | 90 ++++++--- .../unit/arcticdb/version_store/test_tail.py | 46 +++-- .../arcticdb/version_store/test_ternary.py | 80 +++++--- 20 files changed, 725 insertions(+), 392 deletions(-) diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index 8795e2ae06..7c7dab6820 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -90,6 +90,8 @@ from packaging.version import Version import arcticdb_ext as ae +from arcticdb.util.arrow import stringify_dictionary_encoded_columns + IS_WINDOWS = sys.platform == "win32" FlattenResult = namedtuple("FlattenResult", ["is_recursive_normalize_preferred", "metastruct", "to_write"]) @@ 
-354,12 +356,17 @@ def _initialize(self, library, env, lib_cfg, custom_normalizer, open_mode, nativ self._open_mode = open_mode self._native_cfg = native_cfg self._runtime_options = runtime_options + self._test_convert_arrow_back_to_pandas = False def set_output_format(self, output_format: Union[OutputFormat, str]): if self._runtime_options is None: self._runtime_options = RuntimeOptions() self._runtime_options.set_output_format(output_format) + def _set_output_format_for_pipeline_tests(self, output_format): + self.set_output_format(output_format) + self._test_convert_arrow_back_to_pandas = True + @classmethod def create_store_from_lib_config(cls, lib_cfg, env, open_mode=OpenMode.DELETE, native_cfg=None): lib = cls.create_lib_from_lib_config(lib_cfg, env, open_mode, native_cfg) @@ -740,6 +747,9 @@ def _resolve_dynamic_strings(self, kwargs): "Windows only supports dynamic_strings=True, using dynamic strings despite configuration or kwarg" ) dynamic_strings = True + if self._test_convert_arrow_back_to_pandas: + # TODO: Hackery, maybe better to skip + dynamic_strings = True return dynamic_strings last_mismatch_msg: Optional[str] = None @@ -2438,6 +2448,18 @@ def _adapt_read_res(self, read_result: ReadResult) -> VersionedItem: else: table = pa.Table.from_batches(record_batches) data = self._arrow_normalizer.denormalize(table, read_result.norm) + if self._test_convert_arrow_back_to_pandas: + # TODO: Deduplicate with convert_arrow_to_pandas_and_remove_categoricals + data = stringify_dictionary_encoded_columns(data) + for i, name in enumerate(data.column_names): + if pa.types.is_integer(data.column(i).type): + new_col = data.column(i).fill_null(0) + data = data.set_column(i, name, new_col) + # TODO: Copy this to convert_arrow_to_pandas_and_remove_categoricals + if pa.types.is_boolean(data.column(i).type): + new_col = data.column(i).fill_null(False) + data = data.set_column(i, name, new_col) + data = data.to_pandas() else: data = self._normalizer.denormalize(read_result.frame_data, read_result.norm) if read_result.norm.HasField("custom"): @@ -2683,9 +2705,6 @@ def add_to_snapshot( """ Add items to a snapshot. Will replace if the snapshot already contains an entry for a particular symbol. - Note: attempt to add non-existing symbol or version to a snapshot will not fail, but will have no effect - on the snapshot. - Parameters ---------- snap_name : `str` @@ -2703,9 +2722,6 @@ def remove_from_snapshot(self, snap_name: str, symbols: List[str], versions: Lis """ Remove items from a snapshot - Note: attempt to remove non-existing symbol or version from a snapshot will not fail, but will have no effect - on the snapshot. 
- Parameters ---------- snap_name : `str` diff --git a/python/tests/hypothesis/arcticdb/test_aggregation_hypothesis.py b/python/tests/hypothesis/arcticdb/test_aggregation_hypothesis.py index a7f4d294fa..c71441143f 100644 --- a/python/tests/hypothesis/arcticdb/test_aggregation_hypothesis.py +++ b/python/tests/hypothesis/arcticdb/test_aggregation_hypothesis.py @@ -24,7 +24,7 @@ ) -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered @use_of_function_scoped_fixtures_in_hypothesis_checked @@ -37,9 +37,10 @@ ], ), ) -def test_aggregation_numeric(lmdb_version_store_v1, df): +def test_aggregation_numeric(lmdb_version_store_v1, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregation_numeric" lib.write(symbol, df) @@ -71,9 +72,10 @@ def test_aggregation_numeric(lmdb_version_store_v1, df): ], ), ) -def test_aggregation_strings(lmdb_version_store_v1, df): +def test_aggregation_strings(lmdb_version_store_v1, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregation_strings" lib.write(symbol, df) @@ -116,12 +118,13 @@ def aggregation_dataframe_list_strategy(draw): @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(dfs=aggregation_dataframe_list_strategy()) -def test_aggregation_numeric_dynamic(lmdb_version_store_dynamic_schema_v1, dfs): +def test_aggregation_numeric_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, dfs): agg_column_dtypes = [df["agg_column"].dtype for df in dfs if "agg_column" in df.columns] common_agg_type = functools.reduce(valid_common_type, agg_column_dtypes) if len(agg_column_dtypes) > 0 else None assume(any("grouping_column" in df.columns for df in dfs) and common_agg_type is not None) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregation_numeric_dynamic" lib.delete(symbol) for df in dfs: @@ -160,9 +163,10 @@ def test_aggregation_numeric_dynamic(lmdb_version_store_dynamic_schema_v1, dfs): ], ), ) -def test_aggregation_strings_dynamic(lmdb_version_store_dynamic_schema_v1, df): +def test_aggregation_strings_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, df): assume(len(df) >= 3) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregation_strings_dynamic" lib.delete(symbol) slices = [ diff --git a/python/tests/hypothesis/arcticdb/test_resample.py b/python/tests/hypothesis/arcticdb/test_resample.py index 7ee2f34caa..51e5c34737 100644 --- a/python/tests/hypothesis/arcticdb/test_resample.py +++ b/python/tests/hypothesis/arcticdb/test_resample.py @@ -18,19 +18,10 @@ COLUMN_DTYPE = ["float", "int", "uint"] ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"] -# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well. -MIN_DATE = np.datetime64("1960-01-01") -MAX_DATE = np.datetime64("2025-01-01") +MIN_DATE = np.datetime64("1969-06-01") +MAX_DATE = np.datetime64("1970-06-01") -pytestmark = pytest.mark.pipeline - - -def dense_row_count_in_resampled_dataframe(df_list, rule): - """ - The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling - with `rule`. 
Assumes df_list is sorted by start date and the indexes are not overlapping. - """ - return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value +pytestmark = pytest.mark.pipeline # Covered @st.composite @@ -111,14 +102,14 @@ def freq_fits_in_64_bits(count, unit): This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer. """ billion = 1_000_000_000 - mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion, "ms": billion // 1000, "us": 1000, "ns": 1} + mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion} return (mult[unit] * count).bit_length() <= 63 @st.composite def rule(draw): count = draw(st.integers(min_value=1, max_value=10_000)) - unit = draw(st.sampled_from(["min", "h", "s", "ms", "us", "ns"])) + unit = draw(st.sampled_from(["min", "h", "s"])) result = f"{count}{unit}" assume(freq_fits_in_64_bits(count=count, unit=unit)) return result @@ -126,7 +117,7 @@ def rule(draw): @st.composite def offset(draw): - unit = draw(st.sampled_from(["s", "min", "h", "ms", "us", "ns", None])) + unit = draw(st.sampled_from(["s", "min", "h", None])) if unit is None: return None count = draw(st.integers(min_value=1, max_value=100)) @@ -181,11 +172,9 @@ def dynamic_schema_column_list(draw): origin=origin(), offset=offset(), ) -def test_resample(lmdb_version_store_v1, df, rule, origin, offset): - # The assumption below is to avoid OOM-ing the GitHub runners. - assume(dense_row_count_in_resampled_dataframe([df], rule) < 150000) - +def test_resample(lmdb_version_store_v1, any_output_format, df, rule, origin, offset): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "sym" logger = get_logger() logger.info(f"Data frame generated has {df.shape[0]} rows") @@ -231,12 +220,12 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset): @use_of_function_scoped_fixtures_in_hypothesis_checked @given(df_list=dynamic_schema_column_list(), rule=rule(), origin=origin(), offset=offset()) @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large]) -def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset): - # The assumption below is to avoid OOM-ing the GitHub runners. 
- assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 150000) - +def test_resample_dynamic_schema( + lmdb_version_store_dynamic_schema_v1, any_output_format, df_list, rule, origin, offset +): common_column_types = compute_common_type_for_columns_in_df_list(df_list) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) lib.version_store.clear() sym = "sym" agg = {f"{name}_{op}": (name, op) for name in common_column_types for op in ALL_AGGREGATIONS} diff --git a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py index a333c5e0a1..db693d147c 100644 --- a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py +++ b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py @@ -27,7 +27,6 @@ ArcticDbNotYetImplemented, InternalException, UserInputException, - ArcticException, ) from arcticdb import QueryBuilder from arcticdb.flattener import Flattener @@ -36,12 +35,7 @@ from arcticdb.version_store._store import VersionedItem from arcticdb_ext.exceptions import _ArcticLegacyCompatibilityException, StorageException from arcticdb_ext.storage import KeyType, NoDataFoundException -from arcticdb_ext.version_store import ( - NoSuchVersionException, - StreamDescriptorMismatch, - ManualClockVersionStore, - DataError, -) +from arcticdb_ext.version_store import NoSuchVersionException, StreamDescriptorMismatch, ManualClockVersionStore from arcticdb.util.test import ( sample_dataframe, sample_dataframe_only_strings, @@ -51,12 +45,10 @@ config_context, distinct_timestamps, ) -from tests.conftest import Marks from tests.util.date import DateRange from arcticdb.util.test import equals from arcticdb.version_store._store import resolve_defaults from tests.util.mark import MACOS, MACOS_WHEEL_BUILD, xfail_azure_chars -from tests.util.marking import marks @pytest.fixture() @@ -847,9 +839,11 @@ def test_range_index(basic_store, sym): assert_equal(expected, vit.data) +@pytest.mark.pipeline # Covered @pytest.mark.parametrize("use_date_range_clause", [True, False]) -@marks([Marks.pipeline, Marks.storage]) -def test_date_range(basic_store, use_date_range_clause): +@pytest.mark.storage +def test_date_range(basic_store, use_date_range_clause, any_output_format): + basic_store._set_output_format_for_pipeline_tests(any_output_format) initial_timestamp = pd.Timestamp("2019-01-01") df = pd.DataFrame(data=np.arange(100), index=pd.date_range(initial_timestamp, periods=100)) sym = "date_test" @@ -895,9 +889,11 @@ def test_date_range(basic_store, use_date_range_clause): assert data_closed[data_closed.columns[0]][-1] == end_offset +@pytest.mark.pipeline # Covered @pytest.mark.parametrize("use_date_range_clause", [True, False]) -@marks([Marks.pipeline, Marks.storage]) -def test_date_range_none(basic_store, use_date_range_clause): +@pytest.mark.storage +def test_date_range_none(basic_store, use_date_range_clause, any_output_format): + basic_store._set_output_format_for_pipeline_tests(any_output_format) sym = "date_test2" rows = 100 initial_timestamp = pd.Timestamp("2019-01-01") @@ -914,9 +910,11 @@ def test_date_range_none(basic_store, use_date_range_clause): assert len(data) == rows +@pytest.mark.pipeline # Covered @pytest.mark.parametrize("use_date_range_clause", [True, False]) -@marks([Marks.pipeline, Marks.storage]) -def test_date_range_start_equals_end(basic_store, use_date_range_clause): +@pytest.mark.storage +def 
test_date_range_start_equals_end(basic_store, use_date_range_clause, any_output_format): + basic_store._set_output_format_for_pipeline_tests(any_output_format) sym = "date_test2" rows = 100 initial_timestamp = pd.Timestamp("2019-01-01") @@ -936,10 +934,12 @@ def test_date_range_start_equals_end(basic_store, use_date_range_clause): assert data[data.columns[0]][0] == start_offset +@pytest.mark.pipeline # Covered @pytest.mark.parametrize("use_date_range_clause", [True, False]) -@marks([Marks.pipeline, Marks.storage]) -def test_date_range_row_sliced(basic_store_tiny_segment, use_date_range_clause): +@pytest.mark.storage +def test_date_range_row_sliced(basic_store_tiny_segment, use_date_range_clause, any_output_format): lib = basic_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_date_range_row_sliced" # basic_store_tiny_segment produces 2x2 segments num_rows = 6 @@ -1705,7 +1705,7 @@ def test_batch_write_then_list_symbol_without_cache(basic_store_factory): assert set(lib.list_symbols()) == set(symbols) -@marks([Marks.storage, Marks.dedup]) +@pytest.mark.storage def test_batch_write_missing_keys_dedup(basic_store_factory): """When there is duplicate data to reuse for the current write, we need to access the index key of the previous versions in order to refer to the corresponding keys for the deduplicated data.""" @@ -2265,26 +2265,6 @@ def test_batch_read_meta_multiple_versions(object_version_store): assert results_dict["sym3"][0].metadata == {"meta3": 1} assert results_dict["sym2"][3].metadata == {"meta2": 4} - # We can supply only an array of symbols, including repeating symbols - results_dict = lib.batch_read_metadata_multi(["sym1", "sym2", "sym1", "sym3", "sym2", "sym1", "sym1"]) - assert results_dict["sym1"][2].metadata == {"meta1": 3} - assert len(results_dict["sym1"]) == 1 - assert results_dict["sym2"][3].metadata == {"meta2": 4} - assert results_dict["sym3"][0].metadata == {"meta3": 1} - - # The lists are of different sizr - with pytest.raises(ArcticException): - results_dict = lib.batch_read_metadata_multi(["sym1", "sym2"], [0, 0, -2]) - - # With negative number we can go back from current versions - assert lib.batch_read_metadata_multi(["sym1", "sym1"], [-1, -2]) == lib.batch_read_metadata_multi( - ["sym1", "sym1"], [2, 1] - ) - - # Check DataError is thrown when requesting non-existing version - with pytest.raises(TypeError): # Not a good error though - issue 10070002655 - results_dict = lib.batch_read_metadata_multi(["sym1"], [10]) - @pytest.mark.storage def test_list_symbols(basic_store): @@ -2830,10 +2810,12 @@ def test_batch_append_with_throw_exception(basic_store, three_col_df): ) +@pytest.mark.pipeline # Covered @pytest.mark.parametrize("use_date_range_clause", [True, False]) -@marks([Marks.pipeline, Marks.storage]) -def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_range_clause): +@pytest.mark.storage +def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_range_clause, any_output_format): lmdb_version_store = basic_store_tombstone_and_sync_passive + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) symbols = [] for i in range(5): symbols.append("sym_{}".format(i)) @@ -2872,7 +2854,6 @@ def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_ @pytest.mark.parametrize("use_row_range_clause", [True, False]) -@marks([Marks.pipeline]) def test_batch_read_row_range(lmdb_version_store_v1, use_row_range_clause): lib = 
lmdb_version_store_v1 num_symbols = 5 diff --git a/python/tests/unit/arcticdb/test_column_stats.py b/python/tests/unit/arcticdb/test_column_stats.py index 02245cce49..4dd9dee577 100644 --- a/python/tests/unit/arcticdb/test_column_stats.py +++ b/python/tests/unit/arcticdb/test_column_stats.py @@ -15,7 +15,7 @@ from arcticdb_ext.version_store import NoSuchVersionException -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered df0 = pd.DataFrame( @@ -53,8 +53,9 @@ def assert_stats_equal(received, expected): pd.testing.assert_index_equal(received.index, expected.index) -def test_column_stats_basic_flow(lmdb_version_store_tiny_segment): +def test_column_stats_basic_flow(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_basic_flow" expected_column_stats = generate_symbol(lib, sym) expected_column_stats.drop( @@ -82,8 +83,9 @@ def test_column_stats_basic_flow(lmdb_version_store_tiny_segment): lib.read_column_stats(sym) -def test_column_stats_infinity(lmdb_version_store_tiny_segment): +def test_column_stats_infinity(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_infinity" df0 = pd.DataFrame({"col_1": [np.inf, 0.5]}, index=pd.date_range("2000-01-01", periods=2)) df1 = pd.DataFrame({"col_1": [1.5, -np.inf]}, index=pd.date_range("2000-01-03", periods=2)) @@ -108,8 +110,9 @@ def test_column_stats_infinity(lmdb_version_store_tiny_segment): assert_stats_equal(column_stats, expected_column_stats) -def test_column_stats_as_of(lmdb_version_store_tiny_segment): +def test_column_stats_as_of(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_as_of" expected_column_stats = generate_symbol(lib, sym) expected_column_stats = expected_column_stats.iloc[[0]] @@ -136,8 +139,9 @@ def test_column_stats_as_of(lmdb_version_store_tiny_segment): lib.read_column_stats(sym, as_of=0) -def test_column_stats_as_of_version_doesnt_exist(lmdb_version_store_tiny_segment): +def test_column_stats_as_of_version_doesnt_exist(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_as_of_version_doesnt_exist" generate_symbol(lib, sym) @@ -153,8 +157,9 @@ def test_column_stats_as_of_version_doesnt_exist(lmdb_version_store_tiny_segment # TODO: When more than one column stat type is implemented, change this to add multiple indexes across multiple columns -def test_column_stats_multiple_indexes_different_columns(lmdb_version_store_tiny_segment): +def test_column_stats_multiple_indexes_different_columns(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_multiple_indexes" expected_column_stats = generate_symbol(lib, sym) @@ -177,8 +182,9 @@ def test_column_stats_multiple_indexes_different_columns(lmdb_version_store_tiny assert_stats_equal(column_stats, expected_column_stats) -def test_column_stats_empty_dict(lmdb_version_store_tiny_segment): +def test_column_stats_empty_dict(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + 
lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_empty_dict" expected_column_stats = generate_symbol(lib, sym) @@ -196,8 +202,9 @@ def test_column_stats_empty_dict(lmdb_version_store_tiny_segment): assert_stats_equal(lib.read_column_stats(sym), expected_column_stats) -def test_column_stats_empty_set(lmdb_version_store_tiny_segment): +def test_column_stats_empty_set(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_empty_set" expected_column_stats = generate_symbol(lib, sym) @@ -215,8 +222,9 @@ def test_column_stats_empty_set(lmdb_version_store_tiny_segment): assert_stats_equal(lib.read_column_stats(sym), expected_column_stats) -def test_column_stats_non_existent_column(lmdb_version_store_tiny_segment): +def test_column_stats_non_existent_column(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_non_existent_column" expected_column_stats = generate_symbol(lib, sym) @@ -233,8 +241,9 @@ def test_column_stats_non_existent_column(lmdb_version_store_tiny_segment): assert_stats_equal(lib.read_column_stats(sym), expected_column_stats) -def test_column_stats_non_existent_stat_type(lmdb_version_store_tiny_segment): +def test_column_stats_non_existent_stat_type(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_non_existent_stat_type" expected_column_stats = generate_symbol(lib, sym) @@ -250,8 +259,9 @@ def test_column_stats_non_existent_stat_type(lmdb_version_store_tiny_segment): assert_stats_equal(lib.read_column_stats(sym), expected_column_stats) -def test_column_stats_pickled_symbol(lmdb_version_store_tiny_segment): +def test_column_stats_pickled_symbol(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_pickled_symbol" lib.write(sym, 1) assert lib.is_symbol_pickled(sym) @@ -261,8 +271,9 @@ def test_column_stats_pickled_symbol(lmdb_version_store_tiny_segment): lib.create_column_stats(sym, column_stats_dict) -def test_column_stats_multiple_creates(lmdb_version_store_tiny_segment): +def test_column_stats_multiple_creates(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_multiple_creates" base_expected_column_stats = generate_symbol(lib, sym) @@ -295,8 +306,9 @@ def test_column_stats_multiple_creates(lmdb_version_store_tiny_segment): assert_stats_equal(column_stats, base_expected_column_stats) -def test_column_stats_string_column_minmax(lmdb_version_store_tiny_segment): +def test_column_stats_string_column_minmax(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_string_column_minmax" generate_symbol(lib, sym) @@ -305,8 +317,9 @@ def test_column_stats_string_column_minmax(lmdb_version_store_tiny_segment): lib.create_column_stats(sym, column_stats_dict) -def test_column_stats_duplicated_primary_index(lmdb_version_store_tiny_segment): +def 
test_column_stats_duplicated_primary_index(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_duplicated_primary_index" total_df = pd.concat((df0, df1)) @@ -329,8 +342,9 @@ def test_column_stats_duplicated_primary_index(lmdb_version_store_tiny_segment): assert_stats_equal(column_stats, expected_column_stats) -def test_column_stats_dynamic_schema_missing_data(lmdb_version_store_tiny_segment_dynamic): +def test_column_stats_dynamic_schema_missing_data(lmdb_version_store_tiny_segment_dynamic, any_output_format): lib = lmdb_version_store_tiny_segment_dynamic + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_dynamic_schema_missing_data" df0 = pd.DataFrame({"col_1": [0.1, 0.2], "col_2": [0.3, 0.4]}, index=pd.date_range("2000-01-01", periods=2)) @@ -392,8 +406,9 @@ def test_column_stats_dynamic_schema_missing_data(lmdb_version_store_tiny_segmen assert_stats_equal(column_stats, expected_column_stats) -def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segment_dynamic): +def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segment_dynamic, any_output_format): lib = lmdb_version_store_tiny_segment_dynamic + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_column_stats_dynamic_schema_types_changing" df0 = pd.DataFrame( @@ -506,7 +521,7 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm assert column_stats.dtypes["v1.0_MAX(float_to_int)"] == np.float64 -def test_column_stats_object_deleted_with_index_key(lmdb_version_store): +def test_column_stats_object_deleted_with_index_key(lmdb_version_store, any_output_format): def clear(): nonlocal expected_count lib.version_store.clear() @@ -629,6 +644,7 @@ def test_prune_previous_api(): assert_column_stats_key_count() lib = lmdb_version_store + lib._set_output_format_for_pipeline_tests(any_output_format) lib_tool = lib.library_tool() sym = "test_column_stats_object_deleted_with_index_key" column_stats_dict = {"col_1": {"MINMAX"}} @@ -654,7 +670,7 @@ def test_prune_previous_api(): " issue is resolved" ) ) -def test_column_stats_object_deleted_with_index_key_batch_methods(lmdb_version_store): +def test_column_stats_object_deleted_with_index_key_batch_methods(lmdb_version_store, any_output_format): def clear(): nonlocal expected_count lib.version_store.clear() @@ -680,6 +696,7 @@ def test_prune_previous_kwarg_batch_methods(): clear() lib = lmdb_version_store + lib._set_output_format_for_pipeline_tests(any_output_format) lib_tool = lib.library_tool() sym = "test_column_stats_object_deleted_with_index_key_batch_methods" column_stats_dict = {"col_1": {"MINMAX"}} diff --git a/python/tests/unit/arcticdb/version_store/test_aggregation.py b/python/tests/unit/arcticdb/version_store/test_aggregation.py index 0084cb76ca..ca9c184b00 100644 --- a/python/tests/unit/arcticdb/version_store/test_aggregation.py +++ b/python/tests/unit/arcticdb/version_store/test_aggregation.py @@ -21,11 +21,12 @@ valid_common_type, ) -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered -def test_group_on_float_column_with_nans(lmdb_version_store_v1): +def test_group_on_float_column_with_nans(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_on_float_column_with_nans" df = 
pd.DataFrame({"grouping_column": [1.0, 2.0, np.nan, 1.0, 2.0, 2.0], "agg_column": [1, 2, 3, 4, 5, 6]}) lib.write(symbol, df) @@ -34,8 +35,9 @@ def test_group_on_float_column_with_nans(lmdb_version_store_v1): # TODO: Add first and last once un-feature flagged @pytest.mark.parametrize("aggregator", ("sum", "min", "max", "mean", "count")) -def test_aggregate_float_columns_with_nans(lmdb_version_store_v1, aggregator): +def test_aggregate_float_columns_with_nans(lmdb_version_store_v1, any_output_format, aggregator): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregate_float_columns_with_nans" df = pd.DataFrame( { @@ -47,8 +49,9 @@ def test_aggregate_float_columns_with_nans(lmdb_version_store_v1, aggregator): generic_aggregation_test(lib, symbol, df, "grouping_column", {"agg_column": aggregator}) -def test_count_aggregation(lmdb_version_store_v1): +def test_count_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_count_aggregation" df = DataFrame( { @@ -62,8 +65,9 @@ def test_count_aggregation(lmdb_version_store_v1): @pytest.mark.skip(reason="Feature flagged off until working with string columns and dynamic schema") -def test_first_aggregation(lmdb_version_store_v1): +def test_first_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_first_aggregation" df = DataFrame( { @@ -77,8 +81,9 @@ def test_first_aggregation(lmdb_version_store_v1): @pytest.mark.skip(reason="Feature flagged off until working with string columns and dynamic schema") -def test_first_agg_with_append(lmdb_version_store_v1): +def test_first_agg_with_append(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_first_agg_with_append" df_0 = pd.DataFrame({"grouping_column": [0], "to_first": [10.0]}) df_1 = pd.DataFrame({"grouping_column": [1], "to_first": [30.0]}) @@ -90,8 +95,9 @@ def test_first_agg_with_append(lmdb_version_store_v1): @pytest.mark.skip(reason="Feature flagged off until working with string columns and dynamic schema") -def test_last_aggregation(lmdb_version_store_v1): +def test_last_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_last_aggregation" df = DataFrame( { @@ -115,8 +121,9 @@ def test_last_aggregation(lmdb_version_store_v1): @pytest.mark.skip(reason="Feature flagged off until working with string columns and dynamic schema") -def test_last_agg_with_append(lmdb_version_store_v1): +def test_last_agg_with_append(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_last_agg_with_append" df_0 = pd.DataFrame({"grouping_column": [0], "to_last": [10.0]}) df_1 = pd.DataFrame({"grouping_column": [1], "to_last": [30.0]}) @@ -127,8 +134,9 @@ def test_last_agg_with_append(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, pd.concat([df_0, df_1, df_2]), "grouping_column", {"to_last": "last"}) -def test_sum_aggregation(lmdb_version_store_v1): +def test_sum_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_sum_aggregation" df = 
DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_sum": [1, 1, 2, 2, 2]}, @@ -138,8 +146,9 @@ def test_sum_aggregation(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_sum_aggregation_bool(lmdb_version_store_v1): +def test_sum_aggregation_bool(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_sum_aggregation" df = DataFrame( { @@ -152,8 +161,9 @@ def test_sum_aggregation_bool(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_mean_aggregation(lmdb_version_store_v1): +def test_mean_aggregation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_mean_aggregation" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_mean": [1, 1, 2, 2, 2]}, @@ -163,8 +173,9 @@ def test_mean_aggregation(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) -def test_mean_aggregation_float(lmdb_version_store_v1): +def test_mean_aggregation_float(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_mean_aggregation_float" df = DataFrame( { @@ -177,8 +188,9 @@ def test_mean_aggregation_float(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) -def test_mean_aggregation_timestamp(lmdb_version_store_v1): +def test_mean_aggregation_timestamp(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_mean_aggregation_float" df = DataFrame( { @@ -206,8 +218,9 @@ def test_mean_aggregation_timestamp(lmdb_version_store_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_mean": "mean"}) -def test_named_agg(lmdb_version_store_tiny_segment): +def test_named_agg(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_named_agg" gen = np.random.default_rng() df = DataFrame({"grouping_column": [1, 1, 1, 2, 3, 4], "agg_column": gen.random(6)}) @@ -235,16 +248,18 @@ def test_named_agg(lmdb_version_store_tiny_segment): assert_frame_equal(expected, received, check_dtype=False) -def test_max_minus_one(lmdb_version_store_v1): +def test_max_minus_one(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_max_minus_one" df = pd.DataFrame({"grouping_column": ["thing"], "to_max": [-1]}) lib.write(symbol, df) generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_max": "max"}) -def test_group_empty_dataframe(lmdb_version_store_v1): +def test_group_empty_dataframe(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_empty_dataframe" df = DataFrame({"grouping_column": [], "to_mean": []}) lib.write(symbol, df) @@ -253,8 +268,9 @@ def test_group_empty_dataframe(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_group_pickled_symbol(lmdb_version_store_v1): +def 
test_group_pickled_symbol(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_pickled_symbol" lib.write(symbol, np.arange(100).tolist()) assert lib.is_symbol_pickled(symbol) @@ -263,8 +279,9 @@ def test_group_pickled_symbol(lmdb_version_store_v1): _ = lib.read(symbol, query_builder=q) -def test_group_column_not_present(lmdb_version_store_v1): +def test_group_column_not_present(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_column_not_present" df = DataFrame({"a": np.arange(2)}, index=np.arange(2)) lib.write(symbol, df) @@ -273,8 +290,9 @@ def test_group_column_not_present(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_group_column_splitting(lmdb_version_store_tiny_segment): +def test_group_column_splitting(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_column_splitting" df = DataFrame( { @@ -295,8 +313,9 @@ def test_group_column_splitting(lmdb_version_store_tiny_segment): ) -def test_group_column_splitting_strings(lmdb_version_store_tiny_segment): +def test_group_column_splitting_strings(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_column_splitting" df = DataFrame( { @@ -317,8 +336,9 @@ def test_group_column_splitting_strings(lmdb_version_store_tiny_segment): ) -def test_aggregation_with_nones_and_nans_in_string_grouping_column(version_store_factory): +def test_aggregation_with_nones_and_nans_in_string_grouping_column(version_store_factory, any_output_format): lib = version_store_factory(column_group_size=2, segment_row_size=2, dynamic_strings=True) + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregation_with_nones_and_nans_in_string_grouping_column" # Structured so that the row-slices of the grouping column contain: # 1 - All strings @@ -351,8 +371,9 @@ def test_aggregation_with_nones_and_nans_in_string_grouping_column(version_store generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_doctring_example_query_builder_groupby_max(lmdb_version_store_v1): +def test_doctring_example_query_builder_groupby_max(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = DataFrame({"grouping_column": ["group_1", "group_1", "group_1"], "to_max": [1, 5, 4]}, index=np.arange(3)) q = QueryBuilder() q = q.groupby("grouping_column").agg({"to_max": "max"}) @@ -364,8 +385,9 @@ def test_doctring_example_query_builder_groupby_max(lmdb_version_store_v1): assert_frame_equal(res.data, df) -def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store_v1): +def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1"], "to_mean": [1.1, 1.4, 2.5], "to_max": [1.1, 1.4, 2.5]}, index=np.arange(3), @@ -387,8 +409,9 @@ def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store ################################## -def 
test_count_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_count_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_count_aggregation_dynamic" df = DataFrame( { @@ -404,8 +427,9 @@ def test_count_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): @pytest.mark.xfail(reason="Not supported yet") -def test_first_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_first_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_first_aggregation_dynamic" df = DataFrame( { @@ -421,8 +445,9 @@ def test_first_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): @pytest.mark.xfail(reason="Not supported yet") -def test_last_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_last_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_last_aggregation_dynamic" df = DataFrame( { @@ -447,8 +472,9 @@ def test_last_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_last": "last"}) -def test_sum_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_sum_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_sum_aggregation_dynamic" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_sum": [1, 1, 2, 2, 2]}, @@ -460,8 +486,11 @@ def test_sum_aggregation_dynamic(lmdb_version_store_dynamic_schema_v1): generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_sum_aggregation_dynamic_bool_missing_aggregated_column(lmdb_version_store_dynamic_schema_v1): +def test_sum_aggregation_dynamic_bool_missing_aggregated_column( + lmdb_version_store_dynamic_schema_v1, any_output_format +): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_sum_aggregation_dynamic" df = DataFrame( {"grouping_column": ["group_1", "group_2"], "to_sum": [True, False]}, @@ -472,8 +501,9 @@ def test_sum_aggregation_dynamic_bool_missing_aggregated_column(lmdb_version_sto generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_sum_aggregation_with_range_index_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_sum_aggregation_with_range_index_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_sum_aggregation_with_range_index_dynamic" df = DataFrame( {"grouping_column": ["group_1", "group_1", "group_1", "group_2", "group_2"], "to_sum": [1, 1, 2, 2, 2]} @@ -484,8 +514,9 @@ def test_sum_aggregation_with_range_index_dynamic(lmdb_version_store_dynamic_sch generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_group_empty_dataframe_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_group_empty_dataframe_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = 
lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_empty_dataframe_dynamic" df = DataFrame({"grouping_column": [], "to_mean": []}) lib.write(symbol, df) @@ -494,8 +525,9 @@ def test_group_empty_dataframe_dynamic(lmdb_version_store_dynamic_schema_v1): lib.read(symbol, query_builder=q) -def test_group_pickled_symbol_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_group_pickled_symbol_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_pickled_symbol_dynamic" lib.write(symbol, np.arange(100).tolist()) assert lib.is_symbol_pickled(symbol) @@ -504,8 +536,9 @@ def test_group_pickled_symbol_dynamic(lmdb_version_store_dynamic_schema_v1): lib.read(symbol, query_builder=q) -def test_group_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_group_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_group_column_not_present_dynamic" df = DataFrame({"a": np.arange(2)}, index=np.arange(2)) lib.write(symbol, df) @@ -515,8 +548,9 @@ def test_group_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1): @pytest.mark.parametrize("agg", ("max", "min", "mean", "sum")) -def test_segment_without_aggregation_column(lmdb_version_store_dynamic_schema_v1, agg): +def test_segment_without_aggregation_column(lmdb_version_store_dynamic_schema_v1, agg, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_segment_without_aggregation_column" write_df = pd.DataFrame({"grouping_column": ["group_0"], "aggregation_column": [10330.0]}) lib.write(symbol, write_df) @@ -527,8 +561,9 @@ def test_segment_without_aggregation_column(lmdb_version_store_dynamic_schema_v1 ) -def test_minimal_repro_type_change(lmdb_version_store_dynamic_schema_v1): +def test_minimal_repro_type_change(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_minimal_repro_type_change" write_df = pd.DataFrame({"grouping_column": ["group_1"], "to_sum": [np.uint8(1)]}) lib.write(symbol, write_df) @@ -537,8 +572,9 @@ def test_minimal_repro_type_change(lmdb_version_store_dynamic_schema_v1): generic_aggregation_test(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_sum": "sum"}) -def test_minimal_repro_type_change_max(lmdb_version_store_dynamic_schema_v1): +def test_minimal_repro_type_change_max(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_minimal_repro_type_change_max" write_df = pd.DataFrame({"grouping_column": ["group_1"], "to_max": [np.uint8(1)]}) lib.write(symbol, write_df) @@ -547,16 +583,18 @@ def test_minimal_repro_type_change_max(lmdb_version_store_dynamic_schema_v1): generic_aggregation_test(lib, symbol, pd.concat([write_df, append_df]), "grouping_column", {"to_max": "max"}) -def test_minimal_repro_type_sum_similar_string_group_values(lmdb_version_store_dynamic_schema_v1): +def test_minimal_repro_type_sum_similar_string_group_values(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = 
lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_minimal_repro_type_sum_similar_string_group_values" df = pd.DataFrame({"grouping_column": ["0", "000"], "to_sum": [1.0, 1.0]}) lib.write(symbol, df) generic_aggregation_test(lib, symbol, df, "grouping_column", {"to_sum": "sum"}) -def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_dynamic_schema_v1): +def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_aggregation_grouping_column_missing_from_row_group" write_df = DataFrame( {"to_sum": [1, 2], "grouping_column": ["group_1", "group_2"]}, @@ -582,7 +620,7 @@ def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_d @pytest.mark.parametrize("first_group", ["0", "1"]) @pytest.mark.parametrize("second_group", ["0", "1"]) def test_sum_aggregation_type( - lmdb_version_store_dynamic_schema_v1, first_dtype, second_dtype, first_group, second_group + lmdb_version_store_dynamic_schema_v1, first_dtype, second_dtype, first_group, second_group, any_output_format ): """ Sum aggregation promotes to the largest type of the respective category. int -> int64, uint -> uint64, float -> float64 @@ -591,6 +629,7 @@ def test_sum_aggregation_type( test we test all configurations of dtypes and grouping options (same group vs different group) """ lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df1 = pd.DataFrame({"grouping_column": [first_group], "to_sum": np.array([1], first_dtype)}) df2 = pd.DataFrame({"grouping_column": [second_group], "to_sum": np.array([1], second_dtype)}) lib.write("sym", df1) @@ -616,7 +655,7 @@ def test_sum_aggregation_type( @pytest.mark.parametrize("extremum", ["min", "max"]) @pytest.mark.parametrize("dtype, default_value", [(np.int32, 0), (np.float32, np.nan), (bool, False)]) def test_extremum_aggregation_with_missing_aggregation_column( - lmdb_version_store_dynamic_schema_v1, extremum, dtype, default_value + lmdb_version_store_dynamic_schema_v1, extremum, dtype, default_value, any_output_format ): """ Test that a sparse column will be backfilled with the correct values. @@ -624,6 +663,7 @@ def test_extremum_aggregation_with_missing_aggregation_column( because the aggregation column is missing, d2 will be the second row which will be dense and not backfilled. 
""" lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "sym" df1 = pd.DataFrame({"agg_column": np.array([0, 0], dtype)}) df2 = pd.DataFrame({"grouping_column": ["a"]}) @@ -640,8 +680,11 @@ def test_extremum_aggregation_with_missing_aggregation_column( assert_frame_equal(data, expected) -def test_mean_timestamp_aggregation_with_missing_aggregation_column(lmdb_version_store_dynamic_schema_v1): +def test_mean_timestamp_aggregation_with_missing_aggregation_column( + lmdb_version_store_dynamic_schema_v1, any_output_format +): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "sym" df1 = pd.DataFrame({"agg": [pd.Timestamp(1)], "grouping": [0]}) df2 = pd.DataFrame({"grouping": [0, 1, 2]}) diff --git a/python/tests/unit/arcticdb/version_store/test_filtering.py b/python/tests/unit/arcticdb/version_store/test_filtering.py index 2819bea8ee..102ea8dfaa 100644 --- a/python/tests/unit/arcticdb/version_store/test_filtering.py +++ b/python/tests/unit/arcticdb/version_store/test_filtering.py @@ -36,11 +36,12 @@ from arcticdb.util._versions import IS_PANDAS_TWO, PANDAS_VERSION, IS_NUMPY_TWO -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered -def test_filter_column_not_present(lmdb_version_store_v1): +def test_filter_column_not_present(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": np.arange(2)}, index=np.arange(2)) q = QueryBuilder() q = q[q["b"] < 5] @@ -50,8 +51,9 @@ def test_filter_column_not_present(lmdb_version_store_v1): _ = lib.read(symbol, query_builder=q) -def test_filter_column_attribute_syntax(lmdb_version_store_v1): +def test_filter_column_attribute_syntax(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_column_attribute_syntax" df = pd.DataFrame({"a": [np.uint8(1), np.uint8(0)]}) lib.write(symbol, df) @@ -67,8 +69,9 @@ def test_filter_infinite_value(): q = q[q["a"] < math.inf] -def test_filter_categorical(lmdb_version_store_v1): +def test_filter_categorical(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": ["hello", "hi", "hello"]}, index=np.arange(3)) df.a = df.a.astype("category") q = QueryBuilder() @@ -79,8 +82,9 @@ def test_filter_categorical(lmdb_version_store_v1): _ = lib.read(symbol, query_builder=q) -def test_filter_date_range_row_indexed(lmdb_version_store_tiny_segment): +def test_filter_date_range_row_indexed(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_date_range_row_indexed" df = pd.DataFrame({"a": np.arange(3)}, index=np.arange(3)) lib.write(symbol, df) @@ -88,8 +92,9 @@ def test_filter_date_range_row_indexed(lmdb_version_store_tiny_segment): lib.read(symbol, date_range=(pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02"))) -def test_filter_explicit_index(lmdb_version_store_v1): +def test_filter_explicit_index(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": [np.uint8(1), np.uint8(0)]}, index=np.arange(2)) q = QueryBuilder() q = q[q["a"] < np.uint8(1)] @@ -99,8 +104,9 @@ def 
test_filter_explicit_index(lmdb_version_store_v1): assert_frame_equal(df.query(pandas_query), lib.read(symbol, query_builder=q).data) -def test_filter_clashing_values(lmdb_version_store_v1): +def test_filter_clashing_values(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_clashing_values" df = pd.DataFrame({"a": [10, 11, 12], "b": ["11", "12", "13"]}, index=np.arange(3)) lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) @@ -111,9 +117,10 @@ def test_filter_clashing_values(lmdb_version_store_v1): generic_filter_test_strings(lib, base_symbol, q, expected) -def test_filter_bool_nonbool_comparison(lmdb_version_store_v1): +def test_filter_bool_nonbool_comparison(lmdb_version_store_v1, any_output_format): symbol = "test_filter_bool_nonbool_comparison" lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"string": ["True", "False"], "numeric": [1, 0], "bool": [True, False]}, index=np.arange(2)) lib.write(symbol, df) @@ -149,8 +156,9 @@ def test_filter_bool_nonbool_comparison(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_filter_bool_column(lmdb_version_store_v1): +def test_filter_bool_column(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_bool_column" df = pd.DataFrame({"a": [True, False]}, index=np.arange(2)) lib.write(symbol, df) @@ -160,8 +168,9 @@ def test_filter_bool_column(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_bool_column_not(lmdb_version_store_v1): +def test_filter_bool_column_not(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_bool_column_not" df = pd.DataFrame({"a": [True, False]}, index=np.arange(2)) lib.write(symbol, df) @@ -171,8 +180,9 @@ def test_filter_bool_column_not(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_bool_column_binary_boolean(lmdb_version_store_v1): +def test_filter_bool_column_binary_boolean(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_bool_column_binary_boolean" df = pd.DataFrame({"a": [True, True, False, False], "b": [True, False, True, False]}, index=np.arange(4)) lib.write(symbol, df) @@ -182,8 +192,9 @@ def test_filter_bool_column_binary_boolean(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_bool_column_comparison(lmdb_version_store_v1): +def test_filter_bool_column_comparison(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_bool_column_comparison" df = pd.DataFrame({"a": [True, False]}, index=np.arange(2)) lib.write(symbol, df) @@ -213,8 +224,9 @@ def test_filter_bool_column_comparison(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_datetime_naive(lmdb_version_store_v1): +def test_filter_datetime_naive(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_datetime_simple" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10)}) lib.write(symbol, 
df) @@ -226,8 +238,9 @@ def test_filter_datetime_naive(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_datetime_isin(lmdb_version_store_v1): +def test_filter_datetime_isin(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_datetime_isin" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10)}) lib.write(symbol, df) @@ -239,8 +252,9 @@ def test_filter_datetime_isin(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_datetime_timedelta(lmdb_version_store_v1): +def test_filter_datetime_timedelta(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_datetime_timedelta" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10)}) pd_ts = pd.Timestamp("2000-01-05") @@ -262,8 +276,9 @@ def test_filter_datetime_timedelta(lmdb_version_store_v1): assert True -def test_filter_datetime_timezone_aware(lmdb_version_store_v1): +def test_filter_datetime_timezone_aware(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_datetime_timezone_aware" df = pd.DataFrame({"a": pd.date_range("2000-01-01", periods=10, tz=timezone("Europe/Amsterdam"))}) lib.write(symbol, df) @@ -277,8 +292,9 @@ def test_filter_datetime_timezone_aware(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_df_query_wrong_type(lmdb_version_store_v1): +def test_df_query_wrong_type(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df1 = pd.DataFrame( { @@ -340,8 +356,9 @@ def test_df_query_wrong_type(lmdb_version_store_v1): lib.read(sym, query_builder=q) -def test_filter_datetime_nanoseconds(lmdb_version_store_v1): +def test_filter_datetime_nanoseconds(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_filter_datetime_nanoseconds" # Dataframe has three rows and a single column containing timestamps 1 nanosecond apart @@ -381,8 +398,9 @@ def test_filter_datetime_nanoseconds(lmdb_version_store_v1): assert_frame_equal(second_and_third_row_result, df.iloc[[1, 2]].reset_index(drop=True)) -def test_filter_isin_clashing_sets(lmdb_version_store_v1): +def test_filter_isin_clashing_sets(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_isin_clashing_sets" a_unique_val = 100000 b_unique_val = 200000 @@ -408,8 +426,11 @@ def test_filter_isin_clashing_sets(lmdb_version_store_v1): ([-1, 0, 1], [0, 1, 2**62], [0, 1]), ], ) -def test_filter_numeric_isin_hashing_overflows(lmdb_version_store_v1, df_col, isin_vals, expected_col): +def test_filter_numeric_isin_hashing_overflows( + lmdb_version_store_v1, df_col, isin_vals, expected_col, any_output_format +): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": df_col}) lib.write("test_filter_numeric_isin_hashing_overflows", df) @@ -421,8 +442,9 @@ def test_filter_numeric_isin_hashing_overflows(lmdb_version_store_v1, df_col, is assert_frame_equal(expected, result) -def test_filter_numeric_isin_unsigned(lmdb_version_store_v1): +def 
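# (Illustrative sketch, separate from the patch: the nanosecond-apart index that
# the comment above describes can be built from integer nanosecond offsets, and
# Timestamp.value exposes the underlying nanosecond count.)
import pandas as pd
base = pd.Timestamp("2000-01-01")
ts = [base + pd.Timedelta(n, unit="ns") for n in range(3)]
assert ts[1].value - ts[0].value == 1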
test_filter_numeric_isin_unsigned(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": [0, 1, 2**64 - 1]}) lib.write("test_filter_numeric_isin_unsigned", df) @@ -441,8 +463,9 @@ def test_filter_numeric_isnotin_mixed_types_exception(): q = q[q["a"].isnotin(vals)] -def test_filter_numeric_isnotin_hashing_overflow(lmdb_version_store_v1): +def test_filter_numeric_isnotin_hashing_overflow(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": [256]}) lib.write("test_filter_numeric_isnotin_hashing_overflow", df) @@ -460,8 +483,11 @@ def test_filter_numeric_isnotin_hashing_overflow(lmdb_version_store_v1): @pytest.mark.parametrize("op", ("in", "not in")) @pytest.mark.parametrize("signed_type", (np.int8, np.int16, np.int32, np.int64)) @pytest.mark.parametrize("uint64_in", ("df", "vals") if PANDAS_VERSION >= Version("1.2") else ("vals",)) -def test_filter_numeric_membership_mixing_int64_and_uint64(lmdb_version_store_v1, op, signed_type, uint64_in): +def test_filter_numeric_membership_mixing_int64_and_uint64( + lmdb_version_store_v1, op, signed_type, uint64_in, any_output_format +): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_numeric_membership_mixing_int64_and_uint64" signed = signed_type(-1) if uint64_in == "df": @@ -476,8 +502,9 @@ def test_filter_numeric_membership_mixing_int64_and_uint64(lmdb_version_store_v1 generic_filter_test(lib, symbol, q, expected) -def test_filter_nones_and_nans_retained_in_string_column(lmdb_version_store_v1): +def test_filter_nones_and_nans_retained_in_string_column(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_filter_nones_and_nans_retained_in_string_column" df = pd.DataFrame({"filter_column": [1, 2, 1, 2, 1, 2], "string_column": ["1", "2", np.nan, "4", None, "6"]}) lib.write(sym, df) @@ -493,8 +520,9 @@ def test_filter_nones_and_nans_retained_in_string_column(lmdb_version_store_v1): # Tests that false matches aren't generated when list members truncate to column values -def test_filter_fixed_width_string_isin_truncation(lmdb_version_store_v1): +def test_filter_fixed_width_string_isin_truncation(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_fixed_width_string_isin_truncation" df = pd.DataFrame({"a": ["1"]}, index=np.arange(1)) lib.write(symbol, df, dynamic_strings=False) @@ -505,7 +533,7 @@ def test_filter_fixed_width_string_isin_truncation(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment): +def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment, any_output_format): # Construct a dataframe and QueryBuilder pair with the following properties: # - original dataframe spanning multiple segments horizontally and vertically (tiny segment == 2x2) # - strings of varying lengths to exercise fixed width strings more completely @@ -514,6 +542,7 @@ def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment): # - at least one segment will need none of the strings in it's pool after filtering # - at least one segment will need some, but not all of the strings in it's pool 
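# (Sketch, not part of the patch.) The hazard the truncation comment above
# guards against: with fixed-width storage, an isin member like "12" must not be
# cut down to the column's width and then wrongly match a column holding "1".
value, width = "12", 1
assert value[:width] == "1"  # what a naive truncating comparison would see
assert value != "1"          # the correct full-string comparison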
after filtering lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_stringpool_shrinking_basic" df = pd.DataFrame( { @@ -532,8 +561,9 @@ def test_filter_stringpool_shrinking_basic(lmdb_version_store_tiny_segment): generic_filter_test_strings(lib, base_symbol, q, expected) -def test_filter_stringpool_shrinking_block_alignment(lmdb_version_store_v1): +def test_filter_stringpool_shrinking_block_alignment(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_stringpool_shrinking_block_alignment" # Create a dataframe with more than one block (3968 bytes) worth of strings for the stringpool string_length = 10 @@ -549,8 +579,9 @@ def test_filter_stringpool_shrinking_block_alignment(lmdb_version_store_v1): generic_filter_test_strings(lib, base_symbol, q, expected) -def test_filter_explicit_type_promotion(lmdb_version_store_v1): +def test_filter_explicit_type_promotion(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( { "uint8": [np.iinfo(np.uint8).min, np.iinfo(np.uint8).max], @@ -632,8 +663,9 @@ def test_filter_explicit_type_promotion(lmdb_version_store_v1): assert np.array_equal(lib.read(symbol, query_builder=q).data, df.loc[[1]]) -def test_filter_column_slicing_different_segments(lmdb_version_store_tiny_segment): +def test_filter_column_slicing_different_segments(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": np.arange(0, 10), "b": np.arange(10, 20), "c": np.arange(20, 30)}, index=np.arange(10)) symbol = "test_filter_column_slicing_different_segments" lib.write(symbol, df) @@ -661,8 +693,9 @@ def test_filter_column_slicing_different_segments(lmdb_version_store_tiny_segmen assert np.array_equal(expected, received) -def test_filter_with_multi_index(lmdb_version_store_v1): +def test_filter_with_multi_index(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_with_multi_index" dt1 = datetime(2019, 4, 8, 10, 5, 2, 1) dt2 = datetime(2019, 4, 9, 10, 5, 2, 1) @@ -678,8 +711,9 @@ def test_filter_with_multi_index(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_on_multi_index(lmdb_version_store_v1): +def test_filter_on_multi_index(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_on_multi_index" dt1 = datetime(2019, 4, 8, 10, 5, 2, 1) dt2 = datetime(2019, 4, 9, 10, 5, 2, 1) @@ -695,8 +729,9 @@ def test_filter_on_multi_index(lmdb_version_store_v1): generic_filter_test(lib, symbol, q, expected) -def test_filter_complex_expression(lmdb_version_store_tiny_segment): +def test_filter_complex_expression(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_complex_expression" df = pd.DataFrame( { @@ -713,8 +748,9 @@ def test_filter_complex_expression(lmdb_version_store_tiny_segment): generic_filter_test(lib, symbol, q, expected) -def test_filter_string_backslash(lmdb_version_store_v1): +def 
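# (Illustrative sketch of the properties listed in the comment above, assuming
# the tiny-segment fixture's 2x2 row/column slicing.)
import pandas as pd
df = pd.DataFrame({
    "a": list(range(6)),
    "s": ["x", "yy", "zzz", "wwww", "vvvvv", "uuuuuu"],  # varying lengths
})
kept = df[df["a"] < 3]
# Rows 3-5 drop out, so a segment that only referenced the longer strings needs
# none of its pool, while one straddling the cut keeps only part of its pool.
assert kept["s"].tolist() == ["x", "yy", "zzz"]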
test_filter_string_backslash(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": ["", "\\"]}, index=np.arange(2)) q = QueryBuilder() q = q[q["a"] == "\\"] @@ -725,8 +761,9 @@ def test_filter_string_backslash(lmdb_version_store_v1): assert np.array_equal(expected, received) -def test_filter_string_single_quote(lmdb_version_store_v1): +def test_filter_string_single_quote(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": ["", "'"]}, index=np.arange(2)) q = QueryBuilder() q = q[q["a"] == "'"] @@ -737,8 +774,9 @@ def test_filter_string_single_quote(lmdb_version_store_v1): assert np.array_equal(expected, received) -def test_filter_string_less_than(lmdb_version_store_v1): +def test_filter_string_less_than(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_less_than" df = pd.DataFrame({"a": ["row1", "row2"]}, index=np.arange(2)) lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) @@ -751,8 +789,9 @@ def test_filter_string_less_than(lmdb_version_store_v1): lib.read(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", query_builder=q).data -def test_filter_string_less_than_equal(lmdb_version_store_v1): +def test_filter_string_less_than_equal(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_less_than_equal" df = pd.DataFrame({"a": ["row1", "row2"]}, index=np.arange(2)) lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) @@ -765,8 +804,9 @@ def test_filter_string_less_than_equal(lmdb_version_store_v1): lib.read(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", query_builder=q).data -def test_filter_string_greater_than(lmdb_version_store_v1): +def test_filter_string_greater_than(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_greater_than" df = pd.DataFrame({"a": ["row1", "row2"]}, index=np.arange(2)) lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) @@ -779,8 +819,9 @@ def test_filter_string_greater_than(lmdb_version_store_v1): lib.read(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", query_builder=q).data -def test_filter_string_greater_than_equal(lmdb_version_store_v1): +def test_filter_string_greater_than_equal(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_greater_than_equal" df = pd.DataFrame({"a": ["row1", "row2"]}, index=np.arange(2)) lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) @@ -793,8 +834,9 @@ def test_filter_string_greater_than_equal(lmdb_version_store_v1): lib.read(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", query_builder=q).data -def test_filter_string_nans_col_val(lmdb_version_store_v1): +def test_filter_string_nans_col_val(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_string_nans_col_val" df = pd.DataFrame({"a": ["row1", "row2", None, np.nan, math.nan]}, index=np.arange(5)) lib.write(symbol, df, 
dynamic_strings=True) @@ -830,8 +872,9 @@ def test_filter_string_nans_col_val(lmdb_version_store_v1): generic_filter_test_nans(lib, symbol, q, expected) -def test_filter_string_nans_col_col(lmdb_version_store_v1): +def test_filter_string_nans_col_col(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_string_nans_col_col" # Compare all combinations of string, None, np.nan, and math.nan to one another df = pd.DataFrame( @@ -856,8 +899,9 @@ def test_filter_string_nans_col_col(lmdb_version_store_v1): @pytest.mark.parametrize("method", ("isna", "notna", "isnull", "notnull")) @pytest.mark.parametrize("dtype", (np.int64, np.float32, np.float64, np.datetime64, str)) -def test_filter_null_filtering(lmdb_version_store_v1, method, dtype): +def test_filter_null_filtering(lmdb_version_store_v1, method, dtype, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_null_filtering" num_rows = 5 if dtype is np.int64: @@ -1040,8 +1084,9 @@ def _clear(first, second): assert not errors -def test_filter_string_number_comparison(lmdb_version_store_v1): +def test_filter_string_number_comparison(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_string_number_comparison" lib.write(symbol, pd.DataFrame({"a": [0], "b": ["hello"]})) q = QueryBuilder() @@ -1070,8 +1115,9 @@ def test_filter_string_number_comparison(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_filter_string_number_set_membership(lmdb_version_store_v1): +def test_filter_string_number_set_membership(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_string_number_set_membership" lib.write(symbol, pd.DataFrame({"a": [0], "b": ["hello"]})) q = QueryBuilder() @@ -1086,8 +1132,9 @@ def test_filter_string_number_set_membership(lmdb_version_store_v1): # float32 comparisons are excluded from the hypothesis tests due to a bug in Pandas, so cover these here instead # https://github.com/pandas-dev/pandas/issues/59524 -def test_float32_binary_comparison(lmdb_version_store_v1): +def test_float32_binary_comparison(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_float32_binary_comparison" df = pd.DataFrame( { @@ -1177,8 +1224,9 @@ def test_head_tail_unfilterable_data(lmdb_version_store_v1, head, sym, data): @pytest.mark.parametrize("lib_type", ["lmdb_version_store_v1", "lmdb_version_store_dynamic_schema_v1"]) -def test_filter_pickled_symbol(request, lib_type): +def test_filter_pickled_symbol(request, lib_type, any_output_format): lib = request.getfixturevalue(lib_type) + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_pickled_symbol" lib.write(symbol, np.arange(100).tolist()) assert lib.is_symbol_pickled(symbol) @@ -1189,8 +1237,9 @@ def test_filter_pickled_symbol(request, lib_type): @pytest.mark.parametrize("lib_type", ["lmdb_version_store_v1", "lmdb_version_store_dynamic_schema_v1"]) -def test_filter_date_range_pickled_symbol(request, lib_type): +def test_filter_date_range_pickled_symbol(request, lib_type, any_output_format): lib = request.getfixturevalue(lib_type) + 
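# (Sketch.) The four methods parametrized above pair up in pandas itself:
# isnull/notnull are aliases of isna/notna, and the QueryBuilder versions are
# expected to follow the same semantics.
import numpy as np
import pandas as pd
s = pd.Series([1.0, np.nan])
assert s.isna().equals(s.isnull())
assert s.notna().equals(s.notnull())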
lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_date_range_pickled_symbol" idx = pd.date_range("2000-01-01", periods=4) df = pd.DataFrame({"a": [[1, 2], [3, 4], [5, 6], [7, 8]]}, index=idx) @@ -1200,8 +1249,9 @@ def test_filter_date_range_pickled_symbol(request, lib_type): lib.read(symbol, date_range=(idx[1], idx[2])) -def test_filter_date_range_none_none(lmdb_version_store_v1): +def test_filter_date_range_none_none(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "sym" idx = pd.date_range("2000-01-01", periods=4) df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=idx) @@ -1215,8 +1265,9 @@ def test_filter_date_range_none_none(lmdb_version_store_v1): ################################## -def test_numeric_filter_dynamic_schema(lmdb_version_store_tiny_segment_dynamic): +def test_numeric_filter_dynamic_schema(lmdb_version_store_tiny_segment_dynamic, any_output_format): lib = lmdb_version_store_tiny_segment_dynamic + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_numeric_filter_dynamic_schema" df = get_wide_dataframe(100) expected, slices = make_dynamic(df) @@ -1233,8 +1284,9 @@ def test_numeric_filter_dynamic_schema(lmdb_version_store_tiny_segment_dynamic): assert_frame_equal(expected, received) -def test_filter_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_filter_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_column_not_present_dynamic" df = pd.DataFrame({"a": np.arange(2)}, index=np.arange(2), dtype="int64") q = QueryBuilder() @@ -1245,8 +1297,9 @@ def test_filter_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1) vit = lib.read(symbol, query_builder=q) -def test_filter_column_present_in_some_segments(lmdb_version_store_dynamic_schema_v1): +def test_filter_column_present_in_some_segments(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_column_not_present_dynamic" df = pd.DataFrame({"a": np.arange(2)}, dtype="int64") lib.write(symbol, df) @@ -1261,8 +1314,9 @@ def test_filter_column_present_in_some_segments(lmdb_version_store_dynamic_schem assert_frame_equal(result, pd.DataFrame({"a": [0], "b": [1]})) -def test_filter_column_type_change(lmdb_version_store_dynamic_schema_v1): +def test_filter_column_type_change(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_column_type_change" # Write a column of float type @@ -1297,8 +1351,9 @@ def test_filter_column_type_change(lmdb_version_store_dynamic_schema_v1): @pytest.mark.parametrize("method", ("isna", "notna", "isnull", "notnull")) @pytest.mark.parametrize("dtype", (np.int64, np.float32, np.float64, np.datetime64, str)) -def test_filter_null_filtering_dynamic(lmdb_version_store_dynamic_schema_v1, method, dtype): +def test_filter_null_filtering_dynamic(lmdb_version_store_dynamic_schema_v1, method, dtype, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "lmdb_version_store_dynamic_schema" num_rows = 3 if dtype is np.int64: @@ -1345,8 +1400,9 
@@ def test_filter_null_filtering_dynamic(lmdb_version_store_dynamic_schema_v1, met # Defrag removes column slicing and therefore basically makes any symbol dynamic -def test_filter_with_column_slicing_defragmented(lmdb_version_store_tiny_segment): +def test_filter_with_column_slicing_defragmented(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_with_column_slicing_defragmented" with config_context("SymbolDataCompact.SegmentCount", 0): df = pd.DataFrame( @@ -1384,8 +1440,9 @@ def test_filter_unsupported_boolean_operators(): @pytest.mark.parametrize("dynamic_strings", [True, False]) -def test_filter_regex_match_basic(lmdb_version_store_v1, sym, dynamic_strings): +def test_filter_regex_match_basic(lmdb_version_store_v1, sym, dynamic_strings, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": ["abc", "abcd", "aabc"], "b": [1, 2, 3], "c": ["12a", "q34c", "567f"]}, @@ -1424,8 +1481,9 @@ def test_filter_regex_match_basic(lmdb_version_store_v1, sym, dynamic_strings): @pytest.mark.parametrize("dynamic_strings", [True, False]) -def test_filter_regex_match_empty_match(lmdb_version_store_v1, sym, dynamic_strings): +def test_filter_regex_match_empty_match(lmdb_version_store_v1, sym, dynamic_strings, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": ["abc", "abcd", "aabc"], "b": [1, 2, 3], "c": ["12a", "q34c", "567f"]}, @@ -1447,8 +1505,9 @@ def test_filter_regex_match_empty_match(lmdb_version_store_v1, sym, dynamic_stri assert lib.read(sym, query_builder=q2).data.empty -def test_filter_regex_match_nans_nones(lmdb_version_store_v1, sym): +def test_filter_regex_match_nans_nones(lmdb_version_store_v1, sym, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=4), data={"a": ["abc", None, "aabc", np.nan], "b": [1, 2, 3, 4], "c": [np.nan, "q34c", None, "567f"]}, @@ -1468,7 +1527,7 @@ def test_filter_regex_match_nans_nones(lmdb_version_store_v1, sym): assert_frame_equal(lib.read(sym, query_builder=q_c).data, expected) -def test_filter_regex_match_invalid_pattern(lmdb_version_store_v1, sym): +def test_filter_regex_match_invalid_pattern(lmdb_version_store_v1, sym, any_output_format): with pytest.raises(InternalException): # Pending changing exception type to UserInputException in v6.0.0 release q = QueryBuilder() q = q[q["a"].regex_match("[")] @@ -1478,8 +1537,9 @@ def test_filter_regex_match_invalid_pattern(lmdb_version_store_v1, sym): q = q[q["b"].regex_match(1)] -def test_filter_regex_match_uncompatible_column(lmdb_version_store_v1, sym): +def test_filter_regex_match_uncompatible_column(lmdb_version_store_v1, sym, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": ["abc", "abcd", "aabc"], "b": [1, 2, 3]} ) @@ -1492,8 +1552,9 @@ def test_filter_regex_match_uncompatible_column(lmdb_version_store_v1, sym): @pytest.mark.parametrize("dynamic_strings", [True, False]) -def test_filter_regex_match_unicode(lmdb_version_store_v1, sym, dynamic_strings): +def 
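# (Hedged sketch of the setup used above; `config_context` and the fixture are
# assumed to be in scope as in the rest of this file.) With the compaction
# threshold at 0 every eligible write compacts, and compaction rewrites row
# slices without column slicing:
#
#   with config_context("SymbolDataCompact.SegmentCount", 0):
#       lib.write(symbol, df.iloc[:2])
#       lib.append(symbol, df.iloc[2:])  # compaction merges the column slices
#
# which is why the comment above says defrag effectively turns any symbol into
# a dynamic-schema one for filtering purposes.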
test_filter_regex_match_unicode(lmdb_version_store_v1, sym, dynamic_strings, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": [f"{unicode_symbols}abc", f"abc{unicode_symbols}", "abc"], "b": [1, 2, 3]}, @@ -1510,8 +1571,9 @@ def test_filter_regex_match_unicode(lmdb_version_store_v1, sym, dynamic_strings) @pytest.mark.parametrize("dynamic_strings", [True, False]) -def test_filter_regex_comma_separated_strings(lmdb_version_store_v1, sym, dynamic_strings): +def test_filter_regex_comma_separated_strings(lmdb_version_store_v1, sym, dynamic_strings, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": ["a-1,d-1", "g-i,3-l", "d-2,-hi"], "b": [1, 2, 3]} ) diff --git a/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py b/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py index 7165351f7a..2007f28a4f 100644 --- a/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py +++ b/python/tests/unit/arcticdb/version_store/test_filtering_hypothesis.py @@ -38,7 +38,7 @@ ) -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered @use_of_function_scoped_fixtures_in_hypothesis_checked @@ -49,9 +49,10 @@ ), val=numeric_type_strategies(), ) -def test_filter_numeric_binary_comparison(lmdb_version_store_v1, df, val): +def test_filter_numeric_binary_comparison(lmdb_version_store_v1, any_output_format, df, val): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_numeric_binary_comparison" lib.write(symbol, df) # Would be cleaner to use pytest.parametrize, but the expensive bit is generating/writing the df, so make sure we @@ -92,9 +93,10 @@ def test_filter_numeric_binary_comparison(lmdb_version_store_v1, df, val): ), val=string_strategy, ) -def test_filter_string_binary_comparison(lmdb_version_store_v1, df, val): +def test_filter_string_binary_comparison(lmdb_version_store_v1, any_output_format, df, val): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_binary_comparison" lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) lib.write(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", df, dynamic_strings=False) @@ -119,9 +121,10 @@ def test_filter_string_binary_comparison(lmdb_version_store_v1, df, val): signed_vals=st.frozensets(signed_integral_type_strategies(), min_size=1), unsigned_vals=st.frozensets(unsigned_integral_type_strategies(), min_size=1), ) -def test_filter_numeric_set_membership(lmdb_version_store_v1, df, signed_vals, unsigned_vals): +def test_filter_numeric_set_membership(lmdb_version_store_v1, any_output_format, df, signed_vals, unsigned_vals): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_numeric_set_membership" lib.write(symbol, df) # Would be cleaner to use pytest.parametrize, but the expensive bit is generating/writing the df, so make sure we @@ -140,9 +143,10 @@ def test_filter_numeric_set_membership(lmdb_version_store_v1, df, signed_vals, u df=dataframe_strategy([column_strategy("a", supported_string_dtypes())]), vals=st.frozensets(string_strategy, min_size=1), ) -def 
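# (Sketch of the pattern the comments above describe: generating and writing the
# hypothesis dataframe dominates the cost, so each test writes once and loops
# over the comparison operators instead of parametrizing. The helper name is
# hypothetical.)
import operator
from arcticdb import QueryBuilder

def check_all_comparisons(lib, symbol, val):
    for op in (operator.lt, operator.le, operator.gt, operator.ge, operator.eq, operator.ne):
        q = QueryBuilder()
        q = q[op(q["a"], val)]  # builds the same expression tree as q[q["a"] < val] etc.
        lib.read(symbol, query_builder=q)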
test_filter_string_set_membership(lmdb_version_store_v1, df, vals): +def test_filter_string_set_membership(lmdb_version_store_v1, any_output_format, df, vals): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_set_membership" lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) lib.write(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", df, dynamic_strings=False) @@ -158,9 +162,10 @@ def test_filter_string_set_membership(lmdb_version_store_v1, df, vals): @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(df=dataframe_strategy([column_strategy("a", supported_integer_dtypes())])) -def test_filter_numeric_empty_set_membership(lmdb_version_store_v1, df): +def test_filter_numeric_empty_set_membership(lmdb_version_store_v1, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_numeric_empty_set_membership" lib.write(symbol, df) # Would be cleaner to use pytest.parametrize, but the expensive bit is generating/writing the df, so make sure we @@ -175,9 +180,10 @@ def test_filter_numeric_empty_set_membership(lmdb_version_store_v1, df): @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(df=dataframe_strategy([column_strategy("a", supported_string_dtypes())])) -def test_filter_string_empty_set_membership(lmdb_version_store_v1, df): +def test_filter_string_empty_set_membership(lmdb_version_store_v1, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_empty_set_membership" lib.write(f"{base_symbol}_{DYNAMIC_STRINGS_SUFFIX}", df, dynamic_strings=True) lib.write(f"{base_symbol}_{FIXED_STRINGS_SUFFIX}", df, dynamic_strings=False) @@ -197,8 +203,9 @@ def test_filter_string_empty_set_membership(lmdb_version_store_v1, df): df_dt=st.datetimes(min_value=datetime(2020, 1, 1), max_value=datetime(2022, 1, 1), timezones=timezone_st()), comparison_dt=st.datetimes(min_value=datetime(2020, 1, 1), max_value=datetime(2022, 1, 1), timezones=timezone_st()), ) -def test_filter_datetime_timezone_aware_hypothesis(lmdb_version_store_v1, df_dt, comparison_dt): +def test_filter_datetime_timezone_aware_hypothesis(lmdb_version_store_v1, any_output_format, df_dt, comparison_dt): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_datetime_timezone_aware_hypothesis" df = pd.DataFrame({"a": [df_dt]}) lib.write(symbol, df) @@ -227,9 +234,10 @@ def test_filter_datetime_timezone_aware_hypothesis(lmdb_version_store_v1, df_dt, [column_strategy("a", supported_numeric_dtypes()), column_strategy("b", supported_numeric_dtypes())] ) ) -def test_filter_binary_boolean(lmdb_version_store_v1, df): +def test_filter_binary_boolean(lmdb_version_store_v1, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_binary_boolean" lib.write(symbol, df) # Would be cleaner to use pytest.parametrize, but the expensive bit is generating/writing the df, so make sure we @@ -254,9 +262,10 @@ def test_filter_binary_boolean(lmdb_version_store_v1, df): df=dataframe_strategy([column_strategy("a", supported_numeric_dtypes())]), val=numeric_type_strategies(), ) -def test_filter_not(lmdb_version_store_v1, df, 
val): +def test_filter_not(lmdb_version_store_v1, any_output_format, df, val): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_not" lib.write(symbol, df) q = QueryBuilder() @@ -276,9 +285,10 @@ def test_filter_not(lmdb_version_store_v1, df, val): ] ), ) -def test_filter_more_columns_than_fit_in_one_segment(lmdb_version_store_tiny_segment, df): +def test_filter_more_columns_than_fit_in_one_segment(lmdb_version_store_tiny_segment, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_more_columns_than_fit_in_one_segment" lib.write(symbol, df) q = QueryBuilder() @@ -328,9 +338,10 @@ def test_filter_with_column_slicing(lmdb_version_store_tiny_segment, df): ), val=numeric_type_strategies(), ) -def test_filter_numeric_binary_comparison_dynamic(lmdb_version_store_dynamic_schema_v1, df, val): +def test_filter_numeric_binary_comparison_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, df, val): assume(len(df) >= 3) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_numeric_binary_comparison_dynamic" lib.delete(symbol) slices = [ @@ -437,9 +448,10 @@ def test_filter_numeric_binary_comparison_dynamic(lmdb_version_store_dynamic_sch ), val=string_strategy, ) -def test_filter_string_binary_comparison_dynamic(lmdb_version_store_dynamic_schema_v1, df, val): +def test_filter_string_binary_comparison_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, df, val): assume(len(df) >= 3) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_binary_comparison_dynamic" slices = [ @@ -477,9 +489,12 @@ def test_filter_string_binary_comparison_dynamic(lmdb_version_store_dynamic_sche signed_vals=st.frozensets(signed_integral_type_strategies(), min_size=1), unsigned_vals=st.frozensets(unsigned_integral_type_strategies(), min_size=1), ) -def test_filter_numeric_set_membership_dynamic(lmdb_version_store_dynamic_schema_v1, df, signed_vals, unsigned_vals): +def test_filter_numeric_set_membership_dynamic( + lmdb_version_store_dynamic_schema_v1, df, signed_vals, unsigned_vals, any_output_format +): assume(len(df) >= 2) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_numeric_set_membership_dynamic" lib.delete(symbol) slices = [ @@ -512,9 +527,10 @@ def test_filter_numeric_set_membership_dynamic(lmdb_version_store_dynamic_schema df=dataframe_strategy([column_strategy("a", supported_string_dtypes())]), vals=st.frozensets(string_strategy, min_size=1), ) -def test_filter_string_set_membership_dynamic(lmdb_version_store_dynamic_schema_v1, df, vals): +def test_filter_string_set_membership_dynamic(lmdb_version_store_dynamic_schema_v1, df, vals, any_output_format): assume(len(df) >= 2) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) base_symbol = "test_filter_string_set_membership_dynamic" slices = [ df[: len(df) // 2], diff --git a/python/tests/unit/arcticdb/version_store/test_head.py b/python/tests/unit/arcticdb/version_store/test_head.py index 2ec3d549a8..87c5958187 100644 --- a/python/tests/unit/arcticdb/version_store/test_head.py +++ b/python/tests/unit/arcticdb/version_store/test_head.py @@ -15,7 +15,7 @@ from 
arcticdb_ext.exceptions import InternalException -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered def generic_head_test(version_store, symbol, df, num_rows): @@ -23,65 +23,79 @@ def generic_head_test(version_store, symbol, df, num_rows): assert np.array_equal(df.head(num_rows), version_store.head(symbol, num_rows).data) -def test_head_large_segment(lmdb_version_store): +def test_head_large_segment(lmdb_version_store, any_output_format): + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) df = DataFrame({"x": np.arange(100_000, dtype=np.int64)}) generic_head_test(lmdb_version_store, "test_head_large_segment", df, 50_000) -def test_head_zero_num_rows(lmdb_version_store, one_col_df): +def test_head_zero_num_rows(lmdb_version_store, one_col_df, any_output_format): + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) generic_head_test(lmdb_version_store, "test_head_zero_num_rows", one_col_df(), 0) -def test_head_one_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_one_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_one_num_rows", one_col_df(), 1) -def test_head_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # lmdb_version_store_tiny_segment has segment_row_size set to 2 generic_head_test(lmdb_version_store_tiny_segment, "test_head_segment_boundary_num_rows", one_col_df(), 2) -def test_head_multiple_segments(lmdb_version_store_tiny_segment, one_col_df): +def test_head_multiple_segments(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # lmdb_version_store_tiny_segment has segment_row_size set to 2 generic_head_test(lmdb_version_store_tiny_segment, "test_head_multiple_segments", one_col_df(), 7) -def test_head_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) generic_head_test(lmdb_version_store_tiny_segment, "test_head_negative_num_rows", one_col_df(), -7) -def test_head_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # one_col_df generates a dataframe with 10 rows generic_head_test(lmdb_version_store_tiny_segment, "test_head_num_rows_greater_than_table_length", one_col_df(), 10) -def test_head_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # one_col_df generates a dataframe with 10 rows generic_head_test( lmdb_version_store_tiny_segment, "test_head_negative_num_rows_equals_table_length", one_col_df(), -10 ) -def 
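# (Sketch.) The pandas semantics the head tests above lean on: n == 0 yields an
# empty frame, negative n drops the last |n| rows, and n beyond the table length
# is clamped.
import pandas as pd
df = pd.DataFrame({"x": range(10)})  # one_col_df also yields 10 rows
assert len(df.head(0)) == 0
assert len(df.head(-7)) == 3   # all but the last 7 rows
assert len(df.head(11)) == 10  # clamped to the table length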
test_head_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # one_col_df generates a dataframe with 10 rows generic_head_test(lmdb_version_store_tiny_segment, "test_head_num_rows_greater_than_table_length", one_col_df(), 11) -def test_head_negative_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df): +def test_head_negative_num_rows_greater_than_table_length( + lmdb_version_store_tiny_segment, one_col_df, any_output_format +): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # one_col_df generates a dataframe with 10 rows generic_head_test( lmdb_version_store_tiny_segment, "test_head_negative_num_rows_greater_than_table_length", one_col_df(), -11 ) -def test_head_default_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_head_default_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_head_default_num_rows" lmdb_version_store_tiny_segment.write(symbol, one_col_df()) num_rows = signature(lmdb_version_store_tiny_segment.head).parameters["n"].default assert np.array_equal(one_col_df().head(num_rows), lmdb_version_store_tiny_segment.head(symbol).data) -def test_head_with_column_filter(lmdb_version_store_tiny_segment, three_col_df): +def test_head_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_head_with_column_filter" lmdb_version_store_tiny_segment.write(symbol, three_col_df()) # lmdb_version_store_tiny_segment has column_group_size set to 2 @@ -94,7 +108,8 @@ def test_head_with_column_filter(lmdb_version_store_tiny_segment, three_col_df): ) -def test_head_pickled_symbol(lmdb_version_store): +def test_head_pickled_symbol(lmdb_version_store, any_output_format): + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_head_pickled_symbol" lmdb_version_store.write(symbol, np.arange(100).tolist()) assert lmdb_version_store.is_symbol_pickled(symbol) @@ -103,8 +118,9 @@ def test_head_pickled_symbol(lmdb_version_store): @pytest.mark.parametrize("n", range(6)) -def test_dynamic_schema_head(lmdb_version_store_dynamic_schema, n): +def test_dynamic_schema_head(lmdb_version_store_dynamic_schema, n, any_output_format): lib = lmdb_version_store_dynamic_schema + lib._set_output_format_for_pipeline_tests(any_output_format) lib.write("sym", DataFrame({"a": [1, 2]}, index=[0, 1])) lib.append("sym", DataFrame({"b": [5, 6]}, index=[2, 3])) result = lib.head("sym", n=n).data diff --git a/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py b/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py index 3d22435362..e8111b8bf9 100644 --- a/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py +++ b/python/tests/unit/arcticdb/version_store/test_lazy_dataframe.py @@ -15,11 +15,12 @@ from arcticdb.util.test import assert_frame_equal -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered -def test_lazy_read(lmdb_library): +def test_lazy_read(lmdb_library, any_output_format): lib = lmdb_library + 
lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_read" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -40,8 +41,9 @@ def test_lazy_read(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_date_range(lmdb_library): +def test_lazy_date_range(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_date_range" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -57,8 +59,9 @@ def test_lazy_date_range(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_filter(lmdb_library): +def test_lazy_filter(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_filter" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -74,8 +77,9 @@ def test_lazy_filter(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_head(lmdb_library): +def test_lazy_head(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_head" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -91,8 +95,9 @@ def test_lazy_head(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_tail(lmdb_library): +def test_lazy_tail(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_tail" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -108,8 +113,9 @@ def test_lazy_tail(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_apply(lmdb_library): +def test_lazy_apply(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_apply" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -126,8 +132,9 @@ def test_lazy_apply(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_apply_inline_col(lmdb_library): +def test_lazy_apply_inline_col(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_apply_inline_col" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -143,8 +150,9 @@ def test_lazy_apply_inline_col(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_project(lmdb_library): +def test_lazy_project(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_project" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -161,8 +169,9 @@ def test_lazy_project(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_project_constant_value(lmdb_library): +def test_lazy_project_constant_value(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_project" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -179,8 +188,9 @@ def 
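# (Hedged sketch of the lazy flow these tests exercise, per the public API;
# `lib` and `sym` stand for the fixtures and symbols above.)
#
#   lazy = lib.read(sym, lazy=True)  # LazyDataFrame: no I/O happens yet
#   lazy = lazy[lazy["col1"] < 5]    # query clauses accumulate on the plan
#   df = lazy.collect().data         # storage is only touched at collect()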
test_lazy_project_constant_value(lmdb_library): assert_frame_equal(expected, received, check_dtype=False) -def test_lazy_ternary(lmdb_library): +def test_lazy_ternary(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_ternary" df = pd.DataFrame( { @@ -201,8 +211,9 @@ def test_lazy_ternary(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_groupby(lmdb_library): +def test_lazy_groupby(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_groupby" df = pd.DataFrame({"col1": [0, 1, 0, 1, 2, 2], "col2": np.arange(6, dtype=np.int64)}) lib.write(sym, df) @@ -216,8 +227,9 @@ def test_lazy_groupby(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_resample(lmdb_library): +def test_lazy_resample(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_resample" df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -235,8 +247,9 @@ def test_lazy_resample(lmdb_library): assert_frame_equal(expected, received) -def test_lazy_regex_match(lmdb_library, sym): +def test_lazy_regex_match(lmdb_library, sym, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( index=pd.date_range(pd.Timestamp(0), periods=3), data={"a": ["abc", "abcd", "aabc"], "b": [1, 2, 3]} ) @@ -251,8 +264,9 @@ def test_lazy_regex_match(lmdb_library, sym): assert_frame_equal(expected, received) -def test_lazy_with_initial_query_builder(lmdb_library): +def test_lazy_with_initial_query_builder(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_chaining" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -270,8 +284,9 @@ def test_lazy_with_initial_query_builder(lmdb_library): assert_frame_equal(expected, received, check_dtype=False) -def test_lazy_chaining(lmdb_library): +def test_lazy_chaining(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_chaining" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -287,8 +302,9 @@ def test_lazy_chaining(lmdb_library): assert_frame_equal(expected, received, check_dtype=False) -def test_lazy_batch_read(lmdb_library): +def test_lazy_batch_read(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym_0 = "test_lazy_batch_read_0" sym_1 = "test_lazy_batch_read_1" df = pd.DataFrame( @@ -317,8 +333,9 @@ def test_lazy_batch_read(lmdb_library): assert_frame_equal(expected_1, received[1].data) -def test_lazy_batch_one_query(lmdb_library): +def test_lazy_batch_one_query(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) syms = [f"test_lazy_batch_one_query_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -334,8 +351,9 @@ def test_lazy_batch_one_query(lmdb_library): assert_frame_equal(expected, vit.data) -def test_lazy_batch_collect_separately(lmdb_library): +def test_lazy_batch_collect_separately(lmdb_library, any_output_format): lib = lmdb_library + 
lib._nvs._set_output_format_for_pipeline_tests(any_output_format) syms = [f"test_lazy_batch_collect_separately_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -358,8 +376,9 @@ def test_lazy_batch_collect_separately(lmdb_library): assert_frame_equal(expected_2, received_2) -def test_lazy_batch_separate_queries_collect_together(lmdb_library): +def test_lazy_batch_separate_queries_collect_together(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) syms = [f"test_lazy_batch_separate_queries_collect_together_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -380,8 +399,9 @@ def test_lazy_batch_separate_queries_collect_together(lmdb_library): assert_frame_equal(expected_2, received[2].data) -def test_lazy_batch_complex(lmdb_library): +def test_lazy_batch_complex(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) syms = [f"test_lazy_batch_complex_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -420,8 +440,9 @@ def test_lazy_batch_complex(lmdb_library): assert_frame_equal(expected_2, received[2].data) -def test_lazy_collect_multiple_times(lmdb_library): +def test_lazy_collect_multiple_times(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_collect_multiple_times" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -442,8 +463,9 @@ def test_lazy_collect_multiple_times(lmdb_library): assert_frame_equal(expected, received_2, check_dtype=False) -def test_lazy_batch_collect_multiple_times(lmdb_library): +def test_lazy_batch_collect_multiple_times(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) syms = [f"test_lazy_batch_collect_multiple_times_{idx}" for idx in range(3)] df = pd.DataFrame( {"col1": np.arange(10, dtype=np.int64), "col2": np.arange(100, 110, dtype=np.int64)}, @@ -469,8 +491,9 @@ def test_lazy_batch_collect_multiple_times(lmdb_library): assert_frame_equal(expected, vit.data) -def test_lazy_collect_twice_with_date_range(lmdb_library): +def test_lazy_collect_twice_with_date_range(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_collect_twice_with_date_range" df = pd.DataFrame( { @@ -488,8 +511,9 @@ def test_lazy_collect_twice_with_date_range(lmdb_library): assert_frame_equal(expected, received_1, check_dtype=False) -def test_lazy_pickling(lmdb_library): +def test_lazy_pickling(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_lazy_pickling" idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, dtype="datetime64[ns]") @@ -511,8 +535,9 @@ def test_lazy_pickling(lmdb_library): assert_frame_equal(expected, received_roundtripped, check_dtype=False) -def test_lazy_batch_pickling(lmdb_library): +def test_lazy_batch_pickling(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) syms = [f"test_lazy_batch_pickling_{idx}" for idx in range(3)] idx = [0, 1, 2, 3, 1000, 1001] idx = np.array(idx, 
dtype="datetime64[ns]") diff --git a/python/tests/unit/arcticdb/version_store/test_projection.py b/python/tests/unit/arcticdb/version_store/test_projection.py index f535a82aee..d858347742 100644 --- a/python/tests/unit/arcticdb/version_store/test_projection.py +++ b/python/tests/unit/arcticdb/version_store/test_projection.py @@ -16,11 +16,12 @@ from arcticdb.util.test import assert_frame_equal, make_dynamic, regularize_dataframe -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered -def test_project_column_not_present(lmdb_version_store_v1): +def test_project_column_not_present(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"a": np.arange(2)}, index=np.arange(2)) q = QueryBuilder() q = q.apply("new", q["b"] + 1) @@ -30,8 +31,9 @@ def test_project_column_not_present(lmdb_version_store_v1): _ = lib.read(symbol, query_builder=q) -def test_project_string_binary_arithmetic(lmdb_version_store_v1): +def test_project_string_binary_arithmetic(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_string_arithmetic" lib.write(symbol, pd.DataFrame({"col_a": [0], "col_b": ["hello"], "col_c": ["bonjour"]})) operands = ["col_a", "col_b", "col_c", "0", 0] @@ -53,8 +55,9 @@ def test_project_string_binary_arithmetic(lmdb_version_store_v1): lib.read(symbol, query_builder=q) -def test_project_string_unary_arithmetic(lmdb_version_store_v1): +def test_project_string_unary_arithmetic(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_string_unary_arithmetic" lib.write(symbol, pd.DataFrame({"a": ["hello"]})) q = QueryBuilder() @@ -69,8 +72,9 @@ def test_project_string_unary_arithmetic(lmdb_version_store_v1): @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=3)]) @pytest.mark.parametrize("value", [5, "hello"]) -def test_project_fixed_value(lmdb_version_store_tiny_segment, index, value): +def test_project_fixed_value(lmdb_version_store_tiny_segment, index, value, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_project_fixed_value" df = pd.DataFrame({"col1": [0, 1, 2], "col2": [3, 4, 5], "col3": [6, 7, 8]}, index=index) lib.write(sym, df) @@ -85,8 +89,9 @@ def test_project_value_set(): QueryBuilder().apply("new_col", [0, 1, 2]) -def test_docstring_example_query_builder_apply(lmdb_version_store_v1): +def test_docstring_example_query_builder_apply(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( { "VWAP": np.arange(0, 10, dtype=np.float64), @@ -111,8 +116,9 @@ def test_docstring_example_query_builder_apply(lmdb_version_store_v1): ################################## -def test_project_dynamic(lmdb_version_store_dynamic_schema_v1): +def test_project_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_dynamic" df = pd.DataFrame( @@ -138,8 +144,9 @@ def test_project_dynamic(lmdb_version_store_dynamic_schema_v1): assert_frame_equal(expected, received) -def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_schema): 
+def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_schema, any_output_format): lib = lmdb_version_store_dynamic_schema + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_column_types_changing_and_missing" # Floats expected = pd.DataFrame({"col_to_project": [0.5, 1.5], "data_col": [0, 1]}, index=np.arange(0, 2)) @@ -168,8 +175,9 @@ def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_sc @pytest.mark.parametrize("index", [None, "timeseries"]) @pytest.mark.parametrize("value", [5, "hello"]) -def test_project_fixed_value_dynamic(lmdb_version_store_dynamic_schema_v1, index, value): +def test_project_fixed_value_dynamic(lmdb_version_store_dynamic_schema_v1, index, value, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_project_fixed_value_dynamic" df0 = pd.DataFrame( {"col1": [0, 0.1, 0.2], "col2": [0.3, 0.4, 0.5]}, diff --git a/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py b/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py index 93c10b2688..aa458b84cc 100644 --- a/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py +++ b/python/tests/unit/arcticdb/version_store/test_projection_hypothesis.py @@ -23,7 +23,7 @@ ) -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered @use_of_function_scoped_fixtures_in_hypothesis_checked @@ -37,9 +37,10 @@ ), val=numeric_type_strategies(), ) -def test_project_numeric_binary_operation(lmdb_version_store_v1, df, val): +def test_project_numeric_binary_operation(lmdb_version_store_v1, any_output_format, df, val): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_numeric_binary_operation" lib.write(symbol, df) # Would be cleaner to use pytest.parametrize, but the expensive bit is generating/writing the df, so make sure we @@ -80,9 +81,10 @@ def test_project_numeric_binary_operation(lmdb_version_store_v1, df, val): @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(df=dataframe_strategy([column_strategy("a", supported_numeric_dtypes(), restrict_range=True)])) -def test_project_numeric_unary_operation(lmdb_version_store_v1, df): +def test_project_numeric_unary_operation(lmdb_version_store_v1, any_output_format, df): assume(not df.empty) lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_numeric_unary_operation" lib.write(symbol, df) q = QueryBuilder() @@ -122,9 +124,10 @@ def test_project_numeric_unary_operation(lmdb_version_store_v1, df): ), val=numeric_type_strategies(), ) -def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, df, val): +def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, df, val): assume(len(df) >= 3) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_numeric_binary_operation_dynamic" lib.delete(symbol) slices = [ @@ -172,9 +175,10 @@ def test_project_numeric_binary_operation_dynamic(lmdb_version_store_dynamic_sch @use_of_function_scoped_fixtures_in_hypothesis_checked @settings(deadline=None) @given(df=dataframe_strategy([column_strategy("a", supported_floating_dtypes(), restrict_range=True)])) -def 
test_project_numeric_unary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, df): +def test_project_numeric_unary_operation_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, df): assume(len(df) >= 2) lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_project_numeric_unary_operation_dynamic" lib.delete(symbol) slices = [ diff --git a/python/tests/unit/arcticdb/version_store/test_query_builder.py b/python/tests/unit/arcticdb/version_store/test_query_builder.py index e16b8e881b..50bdf1631c 100644 --- a/python/tests/unit/arcticdb/version_store/test_query_builder.py +++ b/python/tests/unit/arcticdb/version_store/test_query_builder.py @@ -22,7 +22,7 @@ from arcticdb.options import OutputFormat import arcticdb.toolbox.query_stats as qs -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered def sort_by_index(df_or_table: Union[pa.Table, pd.DataFrame]): diff --git a/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py b/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py index 007a802e9a..762c9b485a 100644 --- a/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py +++ b/python/tests/unit/arcticdb/version_store/test_query_builder_batch.py @@ -16,11 +16,12 @@ from arcticdb_ext.exceptions import InternalException, StorageException, UserInputException -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered -def test_filter_batch_one_query(lmdb_version_store_v1): +def test_filter_batch_one_query(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym1 = "sym1" sym2 = "sym2" df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2)) @@ -38,8 +39,9 @@ def test_filter_batch_one_query(lmdb_version_store_v1): assert np.array_equal(df2.query(pandas_query), res2) -def test_filter_batch_multiple_queries(lmdb_version_store_v1): +def test_filter_batch_multiple_queries(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym1 = "sym1" sym2 = "sym2" df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2)) @@ -60,8 +62,9 @@ def test_filter_batch_multiple_queries(lmdb_version_store_v1): assert np.array_equal(df2.query(pandas_query2), res2) -def test_filter_batch_multiple_queries_with_none(lmdb_version_store_v1): +def test_filter_batch_multiple_queries_with_none(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym1 = "sym1" sym2 = "sym2" df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2)) @@ -79,8 +82,9 @@ def test_filter_batch_multiple_queries_with_none(lmdb_version_store_v1): assert np.array_equal(df2.query(pandas_query2), res2) -def test_filter_batch_incorrect_query_count(lmdb_version_store_v1): +def test_filter_batch_incorrect_query_count(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym1 = "sym1" sym2 = "sym2" df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2)) @@ -96,8 +100,9 @@ def test_filter_batch_incorrect_query_count(lmdb_version_store_v1): lib.batch_read([sym1, sym2], query_builder=[q, q, q]) -def test_filter_batch_symbol_doesnt_exist(lmdb_version_store_v1): +def test_filter_batch_symbol_doesnt_exist(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + 
lib._set_output_format_for_pipeline_tests(any_output_format) sym1 = "sym1" sym2 = "sym2" df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2)) @@ -108,8 +113,9 @@ def test_filter_batch_symbol_doesnt_exist(lmdb_version_store_v1): lib.batch_read([sym1, sym2], query_builder=q) -def test_filter_batch_version_doesnt_exist(lmdb_version_store_v1): +def test_filter_batch_version_doesnt_exist(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym1 = "sym1" sym2 = "sym2" df1 = pd.DataFrame({"a": [1, 2]}, index=np.arange(2)) @@ -123,8 +129,9 @@ def test_filter_batch_version_doesnt_exist(lmdb_version_store_v1): lib.batch_read([sym1, sym2], as_ofs=[0, 1], query_builder=q) -def test_filter_batch_missing_keys(lmdb_version_store_v1): +def test_filter_batch_missing_keys(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df1 = pd.DataFrame({"a": [3, 5, 7]}) df2 = pd.DataFrame({"a": [4, 6, 8]}) diff --git a/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py b/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py index cde0b4707c..f5f6fa5bc9 100644 --- a/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py +++ b/python/tests/unit/arcticdb/version_store/test_query_builder_sparse.py @@ -17,7 +17,7 @@ from arcticdb.util.hypothesis import use_of_function_scoped_fixtures_in_hypothesis_checked -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered class TestQueryBuilderSparse: @@ -25,8 +25,9 @@ class TestQueryBuilderSparse: df = None @pytest.fixture(autouse=True) - def write_test_data(self, lmdb_version_store): + def write_test_data(self, lmdb_version_store, any_output_format): lib = lmdb_version_store + lib._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "sparse1": [1.0, np.nan, 2.0, np.nan], @@ -145,8 +146,9 @@ def test_groupby(self, lmdb_version_store): assert_frame_equal(expected, received, check_dtype=False) -def test_query_builder_sparse_dynamic_schema_type_change(lmdb_version_store_dynamic_schema): +def test_query_builder_sparse_dynamic_schema_type_change(lmdb_version_store_dynamic_schema, any_output_format): lib = lmdb_version_store_dynamic_schema + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_query_builder_sparse_dynamic_schema_type_change" df_0 = pd.DataFrame( { diff --git a/python/tests/unit/arcticdb/version_store/test_resample.py b/python/tests/unit/arcticdb/version_store/test_resample.py index ef65c7a06e..29213e3d65 100644 --- a/python/tests/unit/arcticdb/version_store/test_resample.py +++ b/python/tests/unit/arcticdb/version_store/test_resample.py @@ -27,7 +27,7 @@ from arcticdb.util._versions import IS_PANDAS_TWO, PANDAS_VERSION import itertools -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"] @@ -74,8 +74,9 @@ def generic_resample_test_with_empty_buckets(lib, sym, rule, aggregations, date_ ) @pytest.mark.parametrize("closed", ("left", "right")) @pytest.mark.parametrize("label", ("left", "right")) -def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label): +def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling" # Want an index 
with data every minute for 2 days, with additional data points 1 nanosecond before and after each # minute to catch off-by-one errors @@ -111,8 +112,9 @@ def test_resampling(lmdb_version_store_v1, freq, date_range, closed, label): @pytest.mark.parametrize("closed", ("left", "right")) -def test_resampling_duplicated_index_value_on_segment_boundary(lmdb_version_store_v1, closed): +def test_resampling_duplicated_index_value_on_segment_boundary(lmdb_version_store_v1, closed, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_duplicated_index_value_on_segment_boundary" # Will group on microseconds df_0 = pd.DataFrame({"col": np.arange(4)}, index=np.array([0, 1, 2, 1000], dtype="datetime64[ns]")) @@ -134,8 +136,9 @@ def test_resampling_duplicated_index_value_on_segment_boundary(lmdb_version_stor class TestResamplingBucketInsideSegment: - def test_all_buckets_have_values(self, lmdb_version_store_v1): + def test_all_buckets_have_values(self, lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_inner_buckets_are_empty" start = dt.datetime(2023, 12, 7, 23, 59, 47, 500000) idx = [start + i * pd.Timedelta("1s") for i in range(0, 8)] @@ -146,8 +149,9 @@ def test_all_buckets_have_values(self, lmdb_version_store_v1): generic_resample_test_with_empty_buckets(lib, sym, "s", {"high": ("mid", "max")}, date_range=date_range) @pytest.mark.parametrize("closed", ("left", "right")) - def test_first_bucket_is_empy(self, lmdb_version_store_v1, closed): + def test_first_bucket_is_empy(self, lmdb_version_store_v1, closed, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_first_bucket_is_empy" idx = pd.DatetimeIndex( [ @@ -166,8 +170,9 @@ def test_first_bucket_is_empy(self, lmdb_version_store_v1, closed): generic_resample_test(lib, sym, "s", {"high": ("mid", "max")}, df, date_range=date_range, closed=closed) @pytest.mark.parametrize("closed", ("left", "right")) - def test_last_bucket_is_empty(self, lmdb_version_store_v1, closed): + def test_last_bucket_is_empty(self, lmdb_version_store_v1, closed, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_last_bucket_is_empty" idx = pd.DatetimeIndex( [ @@ -186,8 +191,9 @@ def test_last_bucket_is_empty(self, lmdb_version_store_v1, closed): date_range = (dt.datetime(2023, 12, 7, 23, 59, 48), dt.datetime(2023, 12, 7, 23, 59, 49, 500000)) generic_resample_test(lib, sym, "s", {"high": ("mid", "max")}, df, date_range=date_range, closed=closed) - def test_inner_buckets_are_empty(self, lmdb_version_store_v1): + def test_inner_buckets_are_empty(self, lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_inner_buckets_are_empty" idx = pd.DatetimeIndex( [ @@ -206,8 +212,9 @@ def test_inner_buckets_are_empty(self, lmdb_version_store_v1): generic_resample_test_with_empty_buckets(lib, sym, "s", {"high": ("mid", "max")}, date_range=date_range) -def test_resampling_timezones(lmdb_version_store_v1): +def test_resampling_timezones(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_timezones" # UK clocks go forward at 1am on March 31st in 2024 index = 
pd.date_range("2024-03-31T00:00:00", freq="min", periods=240, tz="Europe/London") @@ -222,10 +229,11 @@ def test_resampling_timezones(lmdb_version_store_v1): generic_resample_test(lib, sym, "h", {"sum": ("col", "sum")}, df) -def test_resampling_nan_correctness(version_store_factory): +def test_resampling_nan_correctness(version_store_factory, any_output_format): lib = version_store_factory( column_group_size=2, segment_row_size=2, dynamic_strings=True, lmdb_config={"map_size": 2**30} ) + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_nan_correctness" # NaN here means NaT for datetime columns and NaN/None in string columns # Create 5 buckets worth of data, each containing 3 values: @@ -277,8 +285,9 @@ def test_resampling_nan_correctness(version_store_factory): generic_resample_test(lib, sym, "us", agg_dict, df) -def test_resampling_bool_columns(lmdb_version_store_tiny_segment): +def test_resampling_bool_columns(lmdb_version_store_tiny_segment, any_output_format): lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_bool_columns" idx = [0, 1, 1000, 1001, 2000, 2001, 3000, 3001] @@ -306,8 +315,9 @@ def test_resampling_bool_columns(lmdb_version_store_tiny_segment): ) -def test_resampling_dynamic_schema_types_changing(lmdb_version_store_dynamic_schema_v1): +def test_resampling_dynamic_schema_types_changing(lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_dynamic_schema_types_changing" # Will group on microseconds idx_0 = [0, 1, 2, 1000] @@ -339,8 +349,9 @@ def test_resampling_dynamic_schema_types_changing(lmdb_version_store_dynamic_sch ) -def test_resampling_empty_bucket_in_range(lmdb_version_store_v1): +def test_resampling_empty_bucket_in_range(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_empty_bucket_in_range" # Group on microseconds, so bucket 1000-1999 will be empty idx = [0, 1, 2000, 2001] @@ -379,7 +390,7 @@ def test_resampling_empty_bucket_in_range(lmdb_version_store_v1): ) -def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_segment): +def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny_segment, any_output_format): # Covers a corner case where the date_range argument specifies that a row-slice is needed, but the bucket boundaries # mean that all of the index values required fall into a bucket being handled by the previous row-slice, and so # the call to ResampleClause::process produces a segment with no rows @@ -390,6 +401,7 @@ def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny # Therefore the only index value from the second row slice remaining to be processed is 3000ns. 
But this is outside # the specified date range, and so this call to ResampleClause::process produces a segment with no rows lib = lmdb_version_store_tiny_segment + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_row_slice_responsible_for_no_buckets" df = pd.DataFrame( { @@ -410,8 +422,9 @@ def test_resampling_row_slice_responsible_for_no_buckets(lmdb_version_store_tiny @pytest.mark.parametrize("tz", (None, "Europe/London")) @pytest.mark.parametrize("named_levels", (True, False)) -def test_resample_multiindex(lmdb_version_store_v1, tz, named_levels): +def test_resample_multiindex(lmdb_version_store_v1, tz, named_levels, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resample_multiindex" multiindex = pd.MultiIndex.from_product( [pd.date_range("2024-01-01", freq="h", periods=5, tz=tz), [0, 1], ["hello", "goodbye"]] @@ -444,8 +457,9 @@ def test_resample_multiindex(lmdb_version_store_v1, tz, named_levels): @pytest.mark.parametrize("use_date_range", (True, False)) @pytest.mark.parametrize("single_query", (True, False)) -def test_resampling_batch_read_query(lmdb_version_store_v1, use_date_range, single_query): +def test_resampling_batch_read_query(lmdb_version_store_v1, use_date_range, single_query, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym_0 = "test_resampling_batch_read_query_0" sym_1 = "test_resampling_batch_read_query_1" @@ -532,8 +546,9 @@ def test_resample_rejects_unsupported_frequency_strings(freq): QueryBuilder().resample(freq + "1h") -def test_resampling_unsupported_aggregation_type_combos(lmdb_version_store_v1): +def test_resampling_unsupported_aggregation_type_combos(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_unsupported_aggregation_type_combos" df = pd.DataFrame({"string": ["hello"], "datetime": [pd.Timestamp(0)]}, index=[pd.Timestamp(0)]) @@ -551,8 +566,9 @@ def test_resampling_unsupported_aggregation_type_combos(lmdb_version_store_v1): lib.read(sym, query_builder=q) -def test_resampling_sparse_data(lmdb_version_store_v1): +def test_resampling_sparse_data(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_sparse_data" # col_1 will be dense, but with fewer rows than the index column, and so semantically sparse @@ -570,8 +586,9 @@ def test_resampling_sparse_data(lmdb_version_store_v1): lib.read(sym, query_builder=q) -def test_resampling_empty_type_column(lmdb_version_store_empty_types_v1): +def test_resampling_empty_type_column(lmdb_version_store_empty_types_v1, any_output_format): lib = lmdb_version_store_empty_types_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_resampling_empty_type_column" lib.write(sym, pd.DataFrame({"col": ["hello"]}, index=[pd.Timestamp(0)])) @@ -594,8 +611,9 @@ def test_resampling_empty_type_column(lmdb_version_store_empty_types_v1): class TestResamplingOffset: @pytest.mark.parametrize("offset", ("30s", pd.Timedelta(seconds=30))) - def test_offset_smaller_than_freq(self, lmdb_version_store_v1, closed, offset): + def test_offset_smaller_than_freq(self, lmdb_version_store_v1, closed, offset, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = 
"test_offset_smaller_than_freq" idx = pd.date_range(pd.Timestamp("2024-01-02"), pd.Timestamp("2024-01-04"), freq="min") rng = np.random.default_rng() @@ -604,8 +622,9 @@ def test_offset_smaller_than_freq(self, lmdb_version_store_v1, closed, offset): generic_resample_test(lib, sym, "2min", all_aggregations_dict("col"), df, closed=closed, offset="30s") @pytest.mark.parametrize("offset", ("2min37s", pd.Timedelta(minutes=2, seconds=37))) - def test_offset_larger_than_freq(self, lmdb_version_store_v1, closed, offset): + def test_offset_larger_than_freq(self, lmdb_version_store_v1, closed, offset, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_offset_larger_than_freq" idx = pd.date_range(pd.Timestamp("2024-01-02"), pd.Timestamp("2024-01-04"), freq="min") rng = np.random.default_rng() @@ -614,8 +633,9 @@ def test_offset_larger_than_freq(self, lmdb_version_store_v1, closed, offset): generic_resample_test(lib, sym, "2min", all_aggregations_dict("col"), df, closed=closed, offset=offset) @pytest.mark.parametrize("offset", ("30s", pd.Timedelta(seconds=30))) - def test_values_on_offset_boundary(self, lmdb_version_store_v1, closed, offset): + def test_values_on_offset_boundary(self, lmdb_version_store_v1, closed, offset, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_offset_larger_than_freq" start = pd.Timestamp("2024-01-02") end = pd.Timestamp("2024-01-04") @@ -637,8 +657,9 @@ def test_values_on_offset_boundary(self, lmdb_version_store_v1, closed, offset): (dt.datetime(2024, 1, 2, 5, 0, 30, 1), dt.datetime(2024, 1, 3, 5, 0, 29, 999999)), ], ) - def test_with_date_range(self, lmdb_version_store_v1, closed, date_range, offset): + def test_with_date_range(self, lmdb_version_store_v1, closed, date_range, offset, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_offset_larger_than_freq" start = pd.Timestamp("2024-01-02") end = pd.Timestamp("2024-01-04") @@ -676,8 +697,9 @@ class TestResamplingOrigin: pd.Timestamp("2025-01-02 12:00:13"), ], ) - def test_origin(self, lmdb_version_store_v1, closed, origin): + def test_origin(self, lmdb_version_store_v1, closed, origin, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_origin_special_values" # Start and end are picked so that #bins * rule + start != end on purpose to test # the bin generation in case of end and end_day @@ -711,8 +733,9 @@ def test_origin(self, lmdb_version_store_v1, closed, origin): (pd.Timestamp("2025-01-01 10:00:03"), pd.Timestamp("2025-01-02 12:00:00")), # end is multiple of rule ], ) - def test_origin_is_multiple_of_freq(self, lmdb_version_store_v1, closed, origin, date_range): + def test_origin_is_multiple_of_freq(self, lmdb_version_store_v1, closed, origin, date_range, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_origin_special_values" start, end = date_range idx = pd.date_range(start, end, freq="10s") @@ -742,8 +765,9 @@ def test_origin_is_multiple_of_freq(self, lmdb_version_store_v1, closed, origin, "epoch", ], ) - def test_pre_epoch_data(self, lmdb_version_store_v1, closed, origin): + def test_pre_epoch_data(self, lmdb_version_store_v1, closed, origin, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = 
"test_origin_special_values" start = pd.Timestamp("1800-01-01 10:00:00") end = pd.Timestamp("1800-01-02 10:00:00") @@ -782,8 +806,9 @@ def test_pre_epoch_data(self, lmdb_version_store_v1, closed, origin): ) ), ) - def test_origin_off_by_one_on_boundary(self, lmdb_version_store_v1, closed, origin, date_range): + def test_origin_off_by_one_on_boundary(self, lmdb_version_store_v1, closed, origin, date_range, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_origin_special_values" start, end = date_range idx = pd.date_range(start, end, freq="10s") @@ -812,8 +837,9 @@ def test_origin_off_by_one_on_boundary(self, lmdb_version_store_v1, closed, orig ), ], ) - def test_non_epoch_origin_throws_with_daterange(self, lmdb_version_store_v1, origin, closed): + def test_non_epoch_origin_throws_with_daterange(self, lmdb_version_store_v1, origin, closed, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_origin_start_throws_with_daterange" lib.write( @@ -832,8 +858,9 @@ def test_non_epoch_origin_throws_with_daterange(self, lmdb_version_store_v1, ori assert all(w in str(exception_info.value) for w in [origin, "origin"]) @pytest.mark.parametrize("origin", ["epoch", pd.Timestamp("2025-01-03 12:00:00")]) - def test_epoch_and_ts_origin_works_with_date_range(self, lmdb_version_store_v1, closed, origin): + def test_epoch_and_ts_origin_works_with_date_range(self, lmdb_version_store_v1, closed, origin, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_origin_special_values" # Start and end are picked so that #bins * rule + start != end on purpose to test # the bin generation in case of end and end_day @@ -872,8 +899,9 @@ def test_epoch_and_ts_origin_works_with_date_range(self, lmdb_version_store_v1, ], ) @pytest.mark.parametrize("offset", ["10s", "13s", "2min"]) -def test_origin_offset_combined(lmdb_version_store_v1, closed, origin, label, offset): +def test_origin_offset_combined(lmdb_version_store_v1, closed, origin, label, offset, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_origin_special_values" # Start and end are picked so that #bins * rule + start != end on purpose to test # the bin generation in case of end and end_day @@ -896,8 +924,9 @@ def test_origin_offset_combined(lmdb_version_store_v1, closed, origin, label, of ) -def test_max_with_one_infinity_element(lmdb_version_store_v1): +def test_max_with_one_infinity_element(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_max_with_one_infinity_element" lib.write(sym, pd.DataFrame({"col": [np.inf]}, index=pd.DatetimeIndex([pd.Timestamp("2024-01-01")]))) @@ -906,8 +935,9 @@ def test_max_with_one_infinity_element(lmdb_version_store_v1): assert np.isinf(lib.read(sym, query_builder=q).data["col_max"][0]) -def test_min_with_one_infinity_element(lmdb_version_store_v1): +def test_min_with_one_infinity_element(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_min_with_one_infinity_element" lib.write(sym, pd.DataFrame({"col": [-np.inf]}, index=pd.DatetimeIndex([pd.Timestamp("2024-01-01")]))) @@ -916,8 +946,9 @@ def test_min_with_one_infinity_element(lmdb_version_store_v1): assert 
np.isneginf(lib.read(sym, query_builder=q).data["col_min"][0]) -def test_date_range_outside_symbol_timerange(lmdb_version_store_v1): +def test_date_range_outside_symbol_timerange(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_date_range_outside_symbol_timerange" df = pd.DataFrame({"col": np.arange(10)}, index=pd.date_range("2025-01-01", periods=10)) lib.write(sym, df) @@ -938,9 +969,12 @@ class TestResampleDynamicSchema: @pytest.mark.parametrize("label", ["left", "right"]) @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("dtype", [np.int32, np.float32, np.uint16]) - def test_aggregation_column_not_in_segment(self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype): + def test_aggregation_column_not_in_segment( + self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype, any_output_format + ): rule = "10ns" lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "sym" df1 = pd.DataFrame( {"aggregated": np.array([1, 2, 3], dtype), "_empty_bucket_tracker_": [0] * 3}, @@ -984,7 +1018,7 @@ def test_aggregation_column_not_in_segment(self, lmdb_version_store_dynamic_sche @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("dtype", [np.int32, np.float32, np.uint16]) def test_bucket_intersects_two_segments_aggregation_column_not_in_first( - self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype + self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype, any_output_format ): rule = "10ns" df1 = pd.DataFrame({"col_0": np.array([1], dtype)}, index=pd.DatetimeIndex([pd.Timestamp(0)])) @@ -997,6 +1031,7 @@ def test_bucket_intersects_two_segments_aggregation_column_not_in_first( df_list = [df1, df2, df3] lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "sym" for df in df_list: lib.append(sym, df) @@ -1027,9 +1062,10 @@ def test_bucket_intersects_two_segments_aggregation_column_not_in_first( @pytest.mark.parametrize("label", ["left", "right"]) @pytest.mark.parametrize("closed", ["left", "right"]) def test_bucket_intersects_two_segments_aggregation_column_not_in_second( - self, lmdb_version_store_dynamic_schema_v1, label, closed + self, lmdb_version_store_dynamic_schema_v1, label, closed, any_output_format ): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) dtype = np.int32 df1 = pd.DataFrame({"col_0": np.array([1], dtype)}, index=pd.DatetimeIndex([pd.Timestamp(0)])) df2 = pd.DataFrame({"col_1": np.array([50], dtype)}, index=pd.DatetimeIndex([pd.Timestamp(1)])) @@ -1069,12 +1105,15 @@ def test_bucket_intersects_two_segments_aggregation_column_not_in_second( @pytest.mark.parametrize("label", ["left", "right"]) @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("dtype", [np.int32, np.float32, np.uint16]) - def test_bucket_spans_two_segments(self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype): + def test_bucket_spans_two_segments( + self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype, any_output_format + ): """ Both segments belong to the same bucket. Resampling two columns, col_0 is only in the first segment, col_1 is only in the second segment. 
""" lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_bucket_spans_two_segments" df0 = pd.DataFrame(data={"col_0": np.array([1], dtype=dtype)}, index=[pd.Timestamp(1)]) df1 = pd.DataFrame(data={"col_1": np.array([2], dtype=dtype)}, index=[pd.Timestamp(2)]) @@ -1124,12 +1163,15 @@ def test_bucket_spans_two_segments(self, lmdb_version_store_dynamic_schema_v1, l @pytest.mark.parametrize("label", ["left", "right"]) @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("dtype", [np.int32, np.float32, np.uint16]) - def test_bucket_spans_three_segments(self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype): + def test_bucket_spans_three_segments( + self, lmdb_version_store_dynamic_schema_v1, label, closed, dtype, any_output_format + ): """ Both segments belong to the same bucket. Resampling two columns, col_0 is only in the first segment, col_1 is only in the second segment. """ lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_bucket_spans_two_segments" df0 = pd.DataFrame( {"col_0": np.array([0, 0], dtype=dtype)}, index=pd.to_datetime([pd.Timestamp(0), pd.Timestamp(1)]) @@ -1211,7 +1253,9 @@ def test_bucket_spans_three_segments(self, lmdb_version_store_dynamic_schema_v1, bool, ], ) - def test_sum_aggregation_type(self, lmdb_version_store_dynamic_schema_v1, first_dtype, second_dtype): + def test_sum_aggregation_type( + self, lmdb_version_store_dynamic_schema_v1, first_dtype, second_dtype, any_output_format + ): """ Sum aggregation in resamling promotes to the largest type of the respective category. int -> int64, uint -> uint64, float -> float64. Dynamic schema allows mixing int and uint. 
@@ -1220,6 +1264,7 @@ def test_sum_aggregation_type(self, lmdb_version_store_dynamic_schema_v1, first_ configurations of dtypes and grouping options (same group vs different group) """ lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) df1 = pd.DataFrame({"to_sum": np.array([1], first_dtype)}, index=pd.DatetimeIndex([pd.Timestamp(1)])) df2 = pd.DataFrame({"to_sum": np.array([1], second_dtype)}, index=pd.DatetimeIndex([pd.Timestamp(2)])) lib.write("sym", df1) @@ -1237,8 +1282,11 @@ def test_sum_aggregation_type(self, lmdb_version_store_dynamic_schema_v1, first_ @pytest.mark.parametrize("label", ["left", "right"]) @pytest.mark.parametrize("closed", ["left", "right"]) - def test_middle_segment_does_not_contain_column(self, lmdb_version_store_dynamic_schema_v1, label, closed): + def test_middle_segment_does_not_contain_column( + self, lmdb_version_store_dynamic_schema_v1, label, closed, any_output_format + ): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_middle_segment_does_not_contain_column" rule = "10ns" origin = "epoch" @@ -1291,8 +1339,9 @@ def test_middle_segment_does_not_contain_column(self, lmdb_version_store_dynamic expected_types=expected_types, ) - def test_int_float_promotion(self, lmdb_version_store_dynamic_schema_v1): + def test_int_float_promotion(self, lmdb_version_store_dynamic_schema_v1, any_output_format): lib = lmdb_version_store_dynamic_schema_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) sym = "test_int_float_promotion" rule = "10ns" origin = "epoch" diff --git a/python/tests/unit/arcticdb/version_store/test_row_range.py b/python/tests/unit/arcticdb/version_store/test_row_range.py index a51784e216..4fd9acb36c 100644 --- a/python/tests/unit/arcticdb/version_store/test_row_range.py +++ b/python/tests/unit/arcticdb/version_store/test_row_range.py @@ -15,7 +15,7 @@ from arcticdb.util.test import assert_frame_equal -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered def generic_row_range_test(version_store, symbol, df, start_row, end_row): @@ -38,41 +38,49 @@ def generic_row_range_test(version_store, symbol, df, start_row, end_row): np.testing.assert_array_equal(expected_array, received_array_via_querybuilder) -def test_row_range_start_row_greater_than_end_row(lmdb_version_store, one_col_df): +def test_row_range_start_row_greater_than_end_row(lmdb_version_store, one_col_df, any_output_format): + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) generic_row_range_test(lmdb_version_store, "test_row_range_start_row_greater_than_end_row", one_col_df(), 3, 2) -def test_row_range_zero_num_rows(lmdb_version_store, one_col_df): +def test_row_range_zero_num_rows(lmdb_version_store, one_col_df, any_output_format): + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) generic_row_range_test(lmdb_version_store, "test_row_range_zero_num_rows", one_col_df(), 2, 2) -def test_row_range_one_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_row_range_one_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_one_num_rows", one_col_df(), 2, 3) -def test_row_range_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df): +def 
test_row_range_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # lmdb_version_store_tiny_segment has segment_row_size set to 2 generic_row_range_test( lmdb_version_store_tiny_segment, "test_row_range_segment_boundary_num_rows", one_col_df(), 2, 4 ) -def test_row_range_multiple_segments(lmdb_version_store_tiny_segment, one_col_df): +def test_row_range_multiple_segments(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # lmdb_version_store_tiny_segment has segment_row_size set to 2 generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_multiple_segments", one_col_df(), 3, 7) -def test_row_range_all_rows(lmdb_version_store_tiny_segment, one_col_df): +def test_row_range_all_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # one_col_df generates a dataframe with 10 rows generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_all_rows", one_col_df(), 0, 10) -def test_row_range_past_end(lmdb_version_store_tiny_segment, one_col_df): +def test_row_range_past_end(lmdb_version_store_tiny_segment, one_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) # one_col_df generates a dataframe with 10 rows generic_row_range_test(lmdb_version_store_tiny_segment, "test_row_range_past_end", one_col_df(), 5, 15) -def test_row_range_with_column_filter(lmdb_version_store_tiny_segment, three_col_df): +def test_row_range_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format): + lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_row_range_with_column_filter" lmdb_version_store_tiny_segment.write(symbol, three_col_df()) # lmdb_version_store_tiny_segment has column_group_size set to 2 @@ -86,7 +94,8 @@ def test_row_range_with_column_filter(lmdb_version_store_tiny_segment, three_col ) -def test_row_range_pickled_symbol(lmdb_version_store): +def test_row_range_pickled_symbol(lmdb_version_store, any_output_format): + lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_row_range_pickled_symbol" lmdb_version_store.write(symbol, np.arange(100).tolist()) assert lmdb_version_store.is_symbol_pickled(symbol) @@ -108,7 +117,8 @@ def test_row_range_pickled_symbol(lmdb_version_store): ), ) @pytest.mark.parametrize("api", ("query_builder", "read", "read_batch")) -def test_row_range_open_ended(lmdb_version_store_v1, api, row_range, expected): +def test_row_range_open_ended(lmdb_version_store_v1, api, row_range, expected, any_output_format): + lmdb_version_store_v1._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_row_range" df = pd.DataFrame({"a": np.arange(100)}) lmdb_version_store_v1.write(symbol, df) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 533909c8af..335c7fea8f 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -16,7 +16,7 @@ from arcticdb.util.test import assert_frame_equal, assert_series_equal from tests.util.mark 
import MACOS_WHEEL_BUILD, WINDOWS -pytestmark = pytest.mark.pipeline +pytestmark = pytest.mark.pipeline # Covered @pytest.mark.parametrize("dynamic_schema", [True, False]) @@ -24,12 +24,15 @@ @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=12)]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_basic(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, index, join): +def test_symbol_concat_basic( + lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, index, join, any_output_format +): lib = lmdb_library_factory( LibraryOptions( dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment ) ) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -76,8 +79,9 @@ def test_symbol_concat_basic(lmdb_library_factory, dynamic_schema, rows_per_segm @pytest.mark.parametrize( "second_type", ["uint8", "uint16", "uint32", "uint64", "int8", "int16", "int32", "int64", "float32", "float64"] ) -def test_symbol_concat_type_promotion(lmdb_library, first_type, second_type): +def test_symbol_concat_type_promotion(lmdb_library, first_type, second_type, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df0 = pd.DataFrame({"col": np.arange(1, dtype=np.dtype(first_type))}) df1 = pd.DataFrame({"col": np.arange(1, dtype=np.dtype(second_type))}) lib.write("sym0", df0) @@ -99,8 +103,9 @@ def test_symbol_concat_type_promotion(lmdb_library, first_type, second_type): @pytest.mark.parametrize("name_0", [None, "", "s1", "s2"]) @pytest.mark.parametrize("name_1", [None, "", "s1", "s2"]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_with_series(lmdb_library_factory, index, name_0, name_1, join): +def test_symbol_concat_with_series(lmdb_library_factory, index, name_0, name_1, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) s_0 = pd.Series(np.arange(8, dtype=np.float64), index=index, name=name_0) s_1 = pd.Series(np.arange(10, 18, dtype=np.float64), index=index, name=name_1) lib.write("sym0", s_0) @@ -111,14 +116,23 @@ def test_symbol_concat_with_series(lmdb_library_factory, index, name_0, name_1, if index is None: expected.index = pd.RangeIndex(len(expected)) expected.name = name_0 if name_0 == name_1 else None - assert_series_equal(expected, received) + if isinstance(received, pd.Series): + assert_series_equal(expected, received) + else: + # TODO: Remove this skip once the conversion below is worked out + pytest.skip("Need to figure out how to convert the series to a dataframe in the same way arrow does, while keeping the column name's type") + expected = pd.DataFrame(expected) + assert_frame_equal(expected, received) @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_different_column_sets(lmdb_library_factory, dynamic_schema, columns_per_segment, join): +def test_symbol_concat_different_column_sets( + lmdb_library_factory, dynamic_schema, columns_per_segment, join, any_output_format +): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, columns_per_segment=columns_per_segment)) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) # 
Use floats and strings so that our backfilling and Pandas' match df_0 = pd.DataFrame( { @@ -149,8 +163,11 @@ def test_symbol_concat_different_column_sets(lmdb_library_factory, dynamic_schem @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) -def test_symbol_concat_integer_columns_outer_join(lmdb_library_factory, dynamic_schema, columns_per_segment): +def test_symbol_concat_integer_columns_outer_join( + lmdb_library_factory, dynamic_schema, columns_per_segment, any_output_format +): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, columns_per_segment=columns_per_segment)) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(5, dtype=np.int64), @@ -181,8 +198,9 @@ def test_symbol_concat_integer_columns_outer_join(lmdb_library_factory, dynamic_ @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_dynamic_schema_missing_columns(lmdb_library_factory, join): +def test_symbol_concat_dynamic_schema_missing_columns(lmdb_library_factory, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=True)) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(5, dtype=np.float64), @@ -227,9 +245,10 @@ def test_symbol_concat_dynamic_schema_missing_columns(lmdb_library_factory, join @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=5)]) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_symbol_concat_empty_column_intersection( - lmdb_library_factory, dynamic_schema, columns_per_segment, index, join + lmdb_library_factory, dynamic_schema, columns_per_segment, index, join, any_output_format ): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema, columns_per_segment=columns_per_segment)) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(5, dtype=np.float64), @@ -269,13 +288,14 @@ def test_symbol_concat_empty_column_intersection( ) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_symbol_concat_column_slicing( - lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, columns, join + lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, columns, join, any_output_format ): lib = lmdb_library_factory( LibraryOptions( dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment ) ) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -305,8 +325,9 @@ def test_symbol_concat_column_slicing( @pytest.mark.parametrize("dynamic_schema", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_filtering_with_column_selection(lmdb_library_factory, dynamic_schema, join): +def test_symbol_concat_filtering_with_column_selection(lmdb_library_factory, dynamic_schema, join, any_output_format): lib = lmdb_library_factory(LibraryOptions(dynamic_schema=dynamic_schema)) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -339,8 +360,9 @@ def test_symbol_concat_filtering_with_column_selection(lmdb_library_factory, dyn @pytest.mark.parametrize("only_incompletes", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def 
test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes, join): +def test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes, join, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) if not only_incompletes: df_0 = pd.DataFrame( {"col1": np.arange(3, dtype=np.float64), "col2": np.arange(3, 6, dtype=np.float64)}, @@ -386,13 +408,14 @@ def test_symbol_concat_with_streaming_incompletes(lmdb_library, only_incompletes @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_symbol_concat_multiindex_basic( - lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join + lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join, any_output_format ): lib = lmdb_library_factory( LibraryOptions( dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment ) ) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame( { "col1": np.arange(12, dtype=np.int64), @@ -412,8 +435,9 @@ def test_symbol_concat_multiindex_basic( @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_with_date_range(lmdb_library, join): +def test_symbol_concat_with_date_range(lmdb_library, join, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -444,12 +468,15 @@ def test_symbol_concat_with_date_range(lmdb_library, join): @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_complex(lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join): +def test_symbol_concat_complex( + lmdb_library_factory, dynamic_schema, rows_per_segment, columns_per_segment, join, any_output_format +): lib = lmdb_library_factory( LibraryOptions( dynamic_schema=dynamic_schema, rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment ) ) + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -495,8 +522,9 @@ def test_symbol_concat_complex(lmdb_library_factory, dynamic_schema, rows_per_se assert_frame_equal(expected, received) -def test_symbol_concat_querybuilder_syntax(lmdb_library): +def test_symbol_concat_querybuilder_syntax(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -543,8 +571,9 @@ def test_symbol_concat_querybuilder_syntax(lmdb_library): @pytest.mark.parametrize("index_name_0", [None, "ts1", "ts2"]) @pytest.mark.parametrize("index_name_1", [None, "ts1", "ts2"]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_differently_named_timeseries(lmdb_library, index_name_0, index_name_1, join): +def test_symbol_concat_differently_named_timeseries(lmdb_library, index_name_0, index_name_1, join, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( {"col1": np.arange(1, dtype=np.float64), "col2": np.arange(1, 2, dtype=np.float64)}, index=[pd.Timestamp(0)] ) @@ -567,9 +596,16 @@ def test_symbol_concat_differently_named_timeseries(lmdb_library, 
index_name_0, @pytest.mark.parametrize("index_name_1_level_1", [None, "hello", "goodbye"]) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_symbol_concat_differently_named_multiindexes( - lmdb_library, index_name_0_level_0, index_name_0_level_1, index_name_1_level_0, index_name_1_level_1, join + lmdb_library, + index_name_0_level_0, + index_name_0_level_1, + index_name_1_level_0, + index_name_1_level_1, + join, + any_output_format, ): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(1, dtype=np.float64), @@ -603,8 +639,9 @@ def test_symbol_concat_differently_named_multiindexes( @pytest.mark.parametrize("tz_0", [None, "Europe/Amsterdam", "US/Eastern"]) @pytest.mark.parametrize("tz_1", [None, "Europe/Amsterdam", "US/Eastern"]) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_timezone_handling(lmdb_library, tz_0, tz_1, join): +def test_symbol_concat_timezone_handling(lmdb_library, tz_0, tz_1, join, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(1, dtype=np.float64), @@ -633,9 +670,10 @@ def test_symbol_concat_timezone_handling(lmdb_library, tz_0, tz_1, join): @pytest.mark.parametrize("tz_1_level_1", [None, "Europe/Amsterdam", "Australia/Sydney"]) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_symbol_concat_multiindex_timezone_handling( - lmdb_library, tz_0_level_0, tz_0_level_1, tz_1_level_0, tz_1_level_1, join + lmdb_library, tz_0_level_0, tz_0_level_1, tz_1_level_0, tz_1_level_1, join, any_output_format ): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame( { "col1": np.arange(1, dtype=np.float64), @@ -679,8 +717,9 @@ def test_symbol_concat_multiindex_timezone_handling( @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_symbol_concat_symbols_with_different_indexes(lmdb_library, join): +def test_symbol_concat_symbols_with_different_indexes(lmdb_library, join, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df_0 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) df_1 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) dt1 = pd.Timestamp(0) @@ -712,16 +751,18 @@ def test_symbol_concat_symbols_with_different_indexes(lmdb_library, join): concat(lib.read_batch(["timestamp_index_sym", "multiindex_sym"], lazy=True), join).collect() -def test_symbol_concat_non_existent_symbol(lmdb_library): +def test_symbol_concat_non_existent_symbol(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) sym = "test_symbol_concat_non_existent_symbol" lib.write(sym, pd.DataFrame({"col": [0]})) with pytest.raises(NoSuchVersionException): concat(lib.read_batch([sym, "non-existent symbol"], lazy=True)).collect() -def test_symbol_concat_pickled_data(lmdb_library): +def test_symbol_concat_pickled_data(lmdb_library, any_output_format): lib = lmdb_library + lib._nvs._set_output_format_for_pipeline_tests(any_output_format) df = pd.DataFrame({"bytes": np.arange(10, dtype=np.uint64)}) pickled_data = {"hi", "there"} lib.write("sym0", df) @@ -731,8 +772,9 @@ def test_symbol_concat_pickled_data(lmdb_library): concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect() -def test_symbol_concat_docstring_example(lmdb_library): +def test_symbol_concat_docstring_example(lmdb_library, any_output_format): lib 
     lib = lmdb_library
+    lib._nvs._set_output_format_for_pipeline_tests(any_output_format)
     df0 = pd.DataFrame(
         {
             "col": [0, 1, 2, 3, 4],
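The `any_output_format` fixture threaded through the hunks above is defined elsewhere in the test suite and does not appear in this diff. A minimal sketch of the pattern, assuming a conftest-style fixture parametrised over the two output formats (the import path and the exact parameter list are assumptions, not the repository's actual definition):

import pytest

from arcticdb import OutputFormat  # assumed import path


@pytest.fixture(params=[OutputFormat.PANDAS, OutputFormat.EXPERIMENTAL_ARROW])
def any_output_format(request):
    # Each test requesting this fixture runs once per output format.
    return request.param

With a fixture of this shape, every test touched here is exercised against both the pandas and the experimental Arrow read paths. Note the two call styles in the hunks: `lmdb_library` fixtures wrap the native store, hence `lib._nvs._set_output_format_for_pipeline_tests(...)`, while the `lmdb_version_store*` fixtures expose the helper directly.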
diff --git a/python/tests/unit/arcticdb/version_store/test_tail.py b/python/tests/unit/arcticdb/version_store/test_tail.py
index c991a387d4..45d7a75b92 100644
--- a/python/tests/unit/arcticdb/version_store/test_tail.py
+++ b/python/tests/unit/arcticdb/version_store/test_tail.py
@@ -15,7 +15,7 @@
 from arcticdb_ext.exceptions import InternalException


-pytestmark = pytest.mark.pipeline
+pytestmark = pytest.mark.pipeline  # Covered


 def generic_tail_test(version_store, symbol, df, num_rows):
@@ -25,65 +25,79 @@ def generic_tail_test(version_store, symbol, df, num_rows):
     assert np.array_equal(expected, actual)


-def test_tail_large_segment(lmdb_version_store):
+def test_tail_large_segment(lmdb_version_store, any_output_format):
+    lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format)
     df = DataFrame({"x": np.arange(100_000, dtype=np.int64)})
     generic_tail_test(lmdb_version_store, "test_tail_large_segment", df, 50_000)


-def test_tail_zero_num_rows(lmdb_version_store, one_col_df):
+def test_tail_zero_num_rows(lmdb_version_store, one_col_df, any_output_format):
+    lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format)
     generic_tail_test(lmdb_version_store, "test_tail_zero_num_rows", one_col_df(), 0)


-def test_tail_one_num_rows(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_one_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_one_num_rows", one_col_df(), 1)


-def test_tail_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_segment_boundary_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     # lmdb_version_store_tiny_segment has segment_row_size set to 2
     generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_segment_boundary_num_rows", one_col_df(), 2)


-def test_tail_multiple_segments(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_multiple_segments(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     # lmdb_version_store_tiny_segment has segment_row_size set to 2
     generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_multiple_segments", one_col_df(), 7)


-def test_tail_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_negative_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_negative_num_rows", one_col_df(), -7)


-def test_tail_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     # one_col_df generates a dataframe with 10 rows
     generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_num_rows_greater_than_table_length", one_col_df(), 10)


-def test_tail_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_negative_num_rows_equals_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     # one_col_df generates a dataframe with 10 rows
     generic_tail_test(
         lmdb_version_store_tiny_segment, "test_tail_negative_num_rows_equals_table_length", one_col_df(), -10
     )


-def test_tail_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     # one_col_df generates a dataframe with 10 rows
     generic_tail_test(lmdb_version_store_tiny_segment, "test_tail_num_rows_greater_than_table_length", one_col_df(), 11)


-def test_tail_negative_num_rows_greater_than_table_length(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_negative_num_rows_greater_than_table_length(
+    lmdb_version_store_tiny_segment, one_col_df, any_output_format
+):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     # one_col_df generates a dataframe with 10 rows
     generic_tail_test(
         lmdb_version_store_tiny_segment, "test_tail_negative_num_rows_greater_than_table_length", one_col_df(), -11
     )


-def test_tail_default_num_rows(lmdb_version_store_tiny_segment, one_col_df):
+def test_tail_default_num_rows(lmdb_version_store_tiny_segment, one_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_tail_default_num_rows"
     lmdb_version_store_tiny_segment.write(symbol, one_col_df())
     num_rows = signature(lmdb_version_store_tiny_segment.tail).parameters["n"].default
     assert np.array_equal(one_col_df().tail(num_rows), lmdb_version_store_tiny_segment.tail(symbol).data)


-def test_tail_with_column_filter(lmdb_version_store_tiny_segment, three_col_df):
+def test_tail_with_column_filter(lmdb_version_store_tiny_segment, three_col_df, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_tail_with_column_filter"
     lmdb_version_store_tiny_segment.write(symbol, three_col_df())
     # lmdb_version_store_tiny_segment has column_group_size set to 2
@@ -96,7 +110,8 @@ def test_tail_with_column_filter(lmdb_version_store_tiny_segment, three_col_df):
     )


-def test_tail_multiple_segments_odd_total_rows(lmdb_version_store_tiny_segment):
+def test_tail_multiple_segments_odd_total_rows(lmdb_version_store_tiny_segment, any_output_format):
+    lmdb_version_store_tiny_segment._set_output_format_for_pipeline_tests(any_output_format)
     generic_tail_test(
         lmdb_version_store_tiny_segment,
         "test_tail_multiple_segments_odd_total_rows",
@@ -105,7 +120,8 @@ def test_tail_multiple_segments_odd_total_rows(lmdb_version_store_tiny_segment):
     )


-def test_tail_pickled_symbol(lmdb_version_store):
+def test_tail_pickled_symbol(lmdb_version_store, any_output_format):
+    lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_tail_pickled_symbol"
     lmdb_version_store.write(symbol, np.arange(100).tolist())
     assert lmdb_version_store.is_symbol_pickled(symbol)
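`_set_output_format_for_pipeline_tests` is a private helper whose definition sits outside this diff. A plausible stand-in, assuming it simply forwards to the public `set_output_format` used elsewhere in the test suite (hypothetical sketch; the real method may also gate on whether pipeline tests are enabled):

def set_output_format_for_pipeline_tests(version_store, output_format):
    # Hypothetical stand-in for the private helper: forward the parametrised
    # format to the public setter, leaving the library default untouched when
    # no format is requested.
    if output_format is not None:
        version_store.set_output_format(output_format)

Keeping this as a private, test-only entry point means production callers keep using `set_output_format` directly, while the test parametrisation can be switched off in one place.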
diff --git a/python/tests/unit/arcticdb/version_store/test_ternary.py b/python/tests/unit/arcticdb/version_store/test_ternary.py
index e578cb03d4..a2d071c2fb 100644
--- a/python/tests/unit/arcticdb/version_store/test_ternary.py
+++ b/python/tests/unit/arcticdb/version_store/test_ternary.py
@@ -21,7 +21,7 @@
 from tests.util.mark import WINDOWS


-pytestmark = pytest.mark.pipeline
+pytestmark = pytest.mark.pipeline  # Covered


 # A lot of the tests in here are designed to test specific code paths in operation_dispatch_ternary.cpp. In particular,
@@ -29,8 +29,9 @@
 # holding those particular types


-def test_project_ternary_condition_as_full_and_empty_result(lmdb_version_store_v1):
+def test_project_ternary_condition_as_full_and_empty_result(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_condition_as_full_and_empty_result"
     df = pd.DataFrame(
         {
@@ -59,8 +60,9 @@ def test_project_ternary_condition_as_full_and_empty_result(lmdb_version_store_v
     assert_frame_equal(expected, received)


-def test_project_ternary_column_column_numeric(lmdb_version_store_v1):
+def test_project_ternary_column_column_numeric(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_column_column_numeric"
     df = pd.DataFrame(
         {
@@ -108,8 +110,9 @@ def test_project_ternary_column_column_numeric(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_project_ternary_column_column_dynamic_strings(lmdb_version_store_v1):
+def test_project_ternary_column_column_dynamic_strings(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_column_column_dynamic_strings"
     df = pd.DataFrame(
         {
@@ -130,8 +133,9 @@ def test_project_ternary_column_column_dynamic_strings(lmdb_version_store_v1):


 @pytest.mark.skipif(WINDOWS, reason="We do not support fixed-width strings on Windows")
-def test_project_ternary_fixed_width_strings(version_store_factory):
+def test_project_ternary_fixed_width_strings(version_store_factory, any_output_format):
     lib = version_store_factory(dynamic_strings=False)
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_fixed_width_strings"
     df = pd.DataFrame(
         {
@@ -156,8 +160,9 @@ def test_project_ternary_fixed_width_strings(version_store_factory):
         lib.read(symbol, query_builder=q)


-def test_project_ternary_column_value_numeric(lmdb_version_store_v1):
+def test_project_ternary_column_value_numeric(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_column_value_numeric"
     df = pd.DataFrame(
         {
@@ -183,8 +188,9 @@ def test_project_ternary_column_value_numeric(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_project_ternary_column_value_strings(lmdb_version_store_v1):
+def test_project_ternary_column_value_strings(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_column_value_strings"
     df = pd.DataFrame(
         {
@@ -210,8 +216,9 @@ def test_project_ternary_column_value_strings(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_project_ternary_value_value_numeric(lmdb_version_store_v1):
+def test_project_ternary_value_value_numeric(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_value_value_numeric"
     df = pd.DataFrame(
         {
@@ -230,8 +237,9 @@ def test_project_ternary_value_value_numeric(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_project_ternary_value_value_string(lmdb_version_store_v1):
+def test_project_ternary_value_value_string(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_value_value_string"
     df = pd.DataFrame(
         {
@@ -258,10 +266,11 @@ def test_project_ternary_value_value_string(lmdb_version_store_v1):
         ),
     ],
 )
-def test_project_ternary_column_sliced(version_store_factory, index):
+def test_project_ternary_column_sliced(version_store_factory, index, any_output_format):
     # Cannot use lmdb_version_store_tiny_segment as it has fixed-width strings, which are not supported with the ternary
     # operator
     lib = version_store_factory(dynamic_strings=True, column_group_size=2, segment_row_size=2)
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_column_sliced_range_index"
     # This fixture has 2 columns per slice, so the column groups will be:
     # - ["conditional", num_1]
@@ -298,8 +307,9 @@ def test_project_ternary_column_sliced(version_store_factory, index):
     assert_frame_equal(expected, received)


-def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1):
+def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1, any_output_format):
     lib = lmdb_version_store_dynamic_schema_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_dynamic_missing_columns"
     all_columns_df = pd.DataFrame(
         {
@@ -415,8 +425,9 @@ def test_project_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_sche
     assert_frame_equal(expected, received)


-def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dynamic_schema_v1):
+def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dynamic_schema_v1, any_output_format):
     lib = lmdb_version_store_dynamic_schema_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_project_ternary_dynamic_missing_columns_strings"
     all_columns_df = pd.DataFrame(
         {
@@ -522,8 +533,9 @@ def test_project_ternary_dynamic_missing_columns_strings(lmdb_version_store_dyna
     assert_frame_equal(expected, received)


-def test_project_ternary_sparse_col_val(lmdb_version_store_v1):
+def test_project_ternary_sparse_col_val(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     sym = "test_project_ternary_sparse_col_val"
     df = pd.DataFrame(
         {
@@ -567,8 +579,9 @@ def test_project_ternary_sparse_col_val(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_project_ternary_sparse_col_col(lmdb_version_store_v1):
+def test_project_ternary_sparse_col_col(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     sym = "test_project_ternary_sparse_col_col"
     df = pd.DataFrame(
         {
@@ -641,8 +654,9 @@ def test_project_ternary_sparse_col_col(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_project_ternary_condition_empty(lmdb_version_store_v1):
+def test_project_ternary_condition_empty(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     sym = "test_project_ternary_condition_empty"
     df = pd.DataFrame(
         {"condition": [0.0, 0.0, 0.0], "col1": [0.0, np.nan, np.nan], "col2": [0.0, np.nan, np.nan]},
np.nan], "col2": [0.0, np.nan, np.nan]}, @@ -657,8 +671,9 @@ def test_project_ternary_condition_empty(lmdb_version_store_v1): assert_frame_equal(expected, received) -def test_filter_ternary_bitset_bitset(lmdb_version_store_v1): +def test_filter_ternary_bitset_bitset(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_bitset_bitset" df = pd.DataFrame( { @@ -677,8 +692,9 @@ def test_filter_ternary_bitset_bitset(lmdb_version_store_v1): assert_frame_equal(expected, received) -def test_filter_ternary_bitset_column(lmdb_version_store_v1): +def test_filter_ternary_bitset_column(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_bitset_column" df = pd.DataFrame( { @@ -703,8 +719,9 @@ def test_filter_ternary_bitset_column(lmdb_version_store_v1): assert_frame_equal(expected, received) -def test_filter_ternary_bool_columns(lmdb_version_store_v1): +def test_filter_ternary_bool_columns(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_bool_columns" df = pd.DataFrame( { @@ -741,8 +758,9 @@ def test_filter_ternary_bool_columns(lmdb_version_store_v1): assert_frame_equal(expected, received) -def test_filter_ternary_bitset_value(lmdb_version_store_v1): +def test_filter_ternary_bitset_value(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_bitset_value" df = pd.DataFrame( { @@ -778,8 +796,9 @@ def test_filter_ternary_bitset_value(lmdb_version_store_v1): assert_frame_equal(expected, received) -def test_filter_ternary_bitset_full_and_empty_results(lmdb_version_store_v1): +def test_filter_ternary_bitset_full_and_empty_results(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_bitset_full_and_empty_results" df = pd.DataFrame( { @@ -819,8 +838,9 @@ def test_filter_ternary_bitset_full_and_empty_results(lmdb_version_store_v1): assert_frame_equal(expected, received) -def test_filter_ternary_column_full_and_empty_results(lmdb_version_store_v1): +def test_filter_ternary_column_full_and_empty_results(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_column_full_and_empty_results" df = pd.DataFrame( { @@ -862,8 +882,9 @@ def test_filter_ternary_column_full_and_empty_results(lmdb_version_store_v1): @pytest.mark.parametrize("value", [True, False]) -def test_filter_ternary_value_full_and_empty_results(lmdb_version_store_v1, value): +def test_filter_ternary_value_full_and_empty_results(lmdb_version_store_v1, value, any_output_format): lib = lmdb_version_store_v1 + lib._set_output_format_for_pipeline_tests(any_output_format) symbol = "test_filter_ternary_value_full_and_empty_results" df = pd.DataFrame( { @@ -903,8 +924,9 @@ def test_filter_ternary_value_full_and_empty_results(lmdb_version_store_v1, valu assert_frame_equal(expected, received) -def test_filter_ternary_full_and_empty_results_squared(lmdb_version_store_v1): +def test_filter_ternary_full_and_empty_results_squared(lmdb_version_store_v1, any_output_format): lib = lmdb_version_store_v1 + 
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_filter_ternary_full_and_empty_results_squared"
     df = pd.DataFrame(
         {
@@ -946,8 +968,9 @@ def test_filter_ternary_full_and_empty_results_squared(lmdb_version_store_v1):
     assert_frame_equal(expected, received)


-def test_filter_ternary_invalid_conditions(lmdb_version_store_v1):
+def test_filter_ternary_invalid_conditions(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_filter_ternary_invalid_conditions"
     # Non-bool column should throw if provided as condition
     df = pd.DataFrame({"conditional": [0]})
@@ -966,8 +989,9 @@ def test_filter_ternary_invalid_conditions(lmdb_version_store_v1):
         lib.read(symbol, query_builder=q)


-def test_filter_ternary_invalid_arguments(lmdb_version_store_v1):
+def test_filter_ternary_invalid_arguments(lmdb_version_store_v1, any_output_format):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_filter_ternary_invalid_arguments"
     df = pd.DataFrame(
         {"conditional": [True], "col1": [0], "col2": ["hello"]},
@@ -1021,8 +1045,9 @@ def test_filter_ternary_pythonic_syntax():
         q[q["col1"] if q["conditional"] else q["col2"]]


-def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1):
+def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schema_v1, any_output_format):
     lib = lmdb_version_store_dynamic_schema_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_filter_ternary_dynamic_missing_columns"
     all_columns_df = pd.DataFrame(
         {
@@ -1165,7 +1190,7 @@ def test_filter_ternary_dynamic_missing_columns(lmdb_version_store_dynamic_schem
         ),
     ),
 )
-def test_ternary_hypothesis(lmdb_version_store_v1, df):
+def test_ternary_hypothesis(lmdb_version_store_v1, df, any_output_format):
     assume(
         not df.empty
         and not df["condition"].isnull().all()
@@ -1173,6 +1198,7 @@ def test_ternary_hypothesis(lmdb_version_store_v1, df):
         and not df["col2"].isnull().all()
     )
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     dense_sym = "test_ternary_hypothesis_dense"
     sparse_sym = "test_ternary_hypothesis_sparse"
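All of the ternary tests above drive the same QueryBuilder surface, which is why they parametrise cleanly over output formats. For reference, a minimal example of the projection pattern they exercise (the `where` import location and the column names are assumptions for illustration):

from arcticdb.version_store.processing import QueryBuilder, where  # import path assumed


def read_with_ternary_projection(lib, sym):
    # Row-wise ternary evaluated inside the processing pipeline, not eagerly
    # in Python: take col1 where the `conditional` column is True, else col2.
    q = QueryBuilder()
    q = q.apply("chosen", where(q["conditional"], q["col1"], q["col2"]))
    return lib.read(sym, query_builder=q).data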