man-group
diff --git a/‎python/arcticdb/version_store/_store.py‎
Lines changed: 18 additions & 6 deletions b/‎python/arcticdb/version_store/_store.py‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎python/tests/hypothesis/arcticdb/test_aggregation_hypothesis.py‎
Lines changed: 9 additions & 5 deletions b/‎python/tests/hypothesis/arcticdb/test_aggregation_hypothesis.py‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎python/tests/hypothesis/arcticdb/test_resample.py‎
Lines changed: 12 additions & 23 deletions b/‎python/tests/hypothesis/arcticdb/test_resample.py‎
Lines changed: 12 additions & 23 deletions
diff --git a/‎python/tests/integration/arcticdb/version_store/test_basic_version_store.py‎
Lines changed: 22 additions & 41 deletions b/‎python/tests/integration/arcticdb/version_store/test_basic_version_store.py‎
Lines changed: 22 additions & 41 deletions
@@ -90,6 +90,8 @@
 from packaging.version import Version
 import arcticdb_ext as ae
 
+from arcticdb.util.arrow import stringify_dictionary_encoded_columns
+
 IS_WINDOWS = sys.platform == "win32"
 
 FlattenResult = namedtuple("FlattenResult", ["is_recursive_normalize_preferred", "metastruct", "to_write"])
@@ -354,12 +356,17 @@ def _initialize(self, library, env, lib_cfg, custom_normalizer, open_mode, nativ
         self._open_mode = open_mode
         self._native_cfg = native_cfg
         self._runtime_options = runtime_options
+        self._test_convert_arrow_back_to_pandas = False
 
     def set_output_format(self, output_format: Union[OutputFormat, str]):
         if self._runtime_options is None:
             self._runtime_options = RuntimeOptions()
         self._runtime_options.set_output_format(output_format)
 
+    def _set_output_format_for_pipeline_tests(self, output_format):
+        self.set_output_format(output_format)
+        self._test_convert_arrow_back_to_pandas = True
+
     @classmethod
     def create_store_from_lib_config(cls, lib_cfg, env, open_mode=OpenMode.DELETE, native_cfg=None):
         lib = cls.create_lib_from_lib_config(lib_cfg, env, open_mode, native_cfg)
@@ -740,6 +747,9 @@ def _resolve_dynamic_strings(self, kwargs):
                     "Windows only supports dynamic_strings=True, using dynamic strings despite configuration or kwarg"
                 )
             dynamic_strings = True
+        if self._test_convert_arrow_back_to_pandas:
+            # TODO: Hackery, maybe better to skip
+            dynamic_strings = True
         return dynamic_strings
 
     last_mismatch_msg: Optional[str] = None
@@ -2434,6 +2444,14 @@ def _adapt_read_res(self, read_result: ReadResult) -> VersionedItem:
                 record_batches.append(pa.RecordBatch._import_from_c(record_batch.array(), record_batch.schema()))
             table = pa.Table.from_batches(record_batches)
             data = self._arrow_normalizer.denormalize(table, read_result.norm)
+            if self._test_convert_arrow_back_to_pandas:
+                # TODO: Deduplicate with convert_arrow_to_pandas_and_remove_categoricals
+                data = stringify_dictionary_encoded_columns(data)
+                for i, name in enumerate(data.column_names):
+                    if pa.types.is_integer(data.column(i).type):
+                        new_col = data.column(i).fill_null(0)
+                        data = data.set_column(i, name, new_col)
+                data = data.to_pandas()
         else:
             data = self._normalizer.denormalize(read_result.frame_data, read_result.norm)
             if read_result.norm.HasField("custom"):
@@ -2679,9 +2697,6 @@ def add_to_snapshot(
         """
         Add items to a snapshot. Will replace if the snapshot already contains an entry for a particular symbol.
 
-        Note: attempt to add non-existing symbol or version to a snapshot will not fail, but will have no effect
-              on the snapshot.
-
         Parameters
         ----------
         snap_name : `str`
@@ -2699,9 +2714,6 @@ def remove_from_snapshot(self, snap_name: str, symbols: List[str], versions: Lis
         """
         Remove items from a snapshot
 
-        Note: attempt to remove non-existing symbol or version from a snapshot will not fail, but will have no effect
-              on the snapshot.
-
         Parameters
         ----------
         snap_name : `str`
 
@@ -24,7 +24,7 @@
 )
 
 
-pytestmark = pytest.mark.pipeline
+pytestmark = pytest.mark.pipeline  # Covered
 
 
 @use_of_function_scoped_fixtures_in_hypothesis_checked
@@ -37,9 +37,10 @@
         ],
     ),
 )
-def test_aggregation_numeric(lmdb_version_store_v1, df):
+def test_aggregation_numeric(lmdb_version_store_v1, any_output_format, df):
     assume(not df.empty)
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_aggregation_numeric"
     lib.write(symbol, df)
 
@@ -71,9 +72,10 @@ def test_aggregation_numeric(lmdb_version_store_v1, df):
         ],
     ),
 )
-def test_aggregation_strings(lmdb_version_store_v1, df):
+def test_aggregation_strings(lmdb_version_store_v1, any_output_format, df):
     assume(not df.empty)
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_aggregation_strings"
     lib.write(symbol, df)
 
@@ -116,12 +118,13 @@ def aggregation_dataframe_list_strategy(draw):
 @use_of_function_scoped_fixtures_in_hypothesis_checked
 @settings(deadline=None)
 @given(dfs=aggregation_dataframe_list_strategy())
-def test_aggregation_numeric_dynamic(lmdb_version_store_dynamic_schema_v1, dfs):
+def test_aggregation_numeric_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, dfs):
     agg_column_dtypes = [df["agg_column"].dtype for df in dfs if "agg_column" in df.columns]
     common_agg_type = functools.reduce(valid_common_type, agg_column_dtypes) if len(agg_column_dtypes) > 0 else None
     assume(any("grouping_column" in df.columns for df in dfs) and common_agg_type is not None)
 
     lib = lmdb_version_store_dynamic_schema_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_aggregation_numeric_dynamic"
     lib.delete(symbol)
     for df in dfs:
@@ -160,9 +163,10 @@ def test_aggregation_numeric_dynamic(lmdb_version_store_dynamic_schema_v1, dfs):
         ],
     ),
 )
-def test_aggregation_strings_dynamic(lmdb_version_store_dynamic_schema_v1, df):
+def test_aggregation_strings_dynamic(lmdb_version_store_dynamic_schema_v1, any_output_format, df):
     assume(len(df) >= 3)
     lib = lmdb_version_store_dynamic_schema_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     symbol = "test_aggregation_strings_dynamic"
     lib.delete(symbol)
     slices = [
 
@@ -18,19 +18,10 @@
 
 COLUMN_DTYPE = ["float", "int", "uint"]
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]
-# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well.
-MIN_DATE = np.datetime64("1960-01-01")
-MAX_DATE = np.datetime64("2025-01-01")
+MIN_DATE = np.datetime64("1969-06-01")
+MAX_DATE = np.datetime64("1970-06-01")
 
-pytestmark = pytest.mark.pipeline
-
-
-def dense_row_count_in_resampled_dataframe(df_list, rule):
-    """
-    The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling
-    with `rule`.  Assumes df_list is sorted by start date and the indexes are not overlapping.
-    """
-    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value
+pytestmark = pytest.mark.pipeline  # Covered
 
 
 @st.composite
@@ -111,22 +102,22 @@ def freq_fits_in_64_bits(count, unit):
     This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer.
     """
     billion = 1_000_000_000
-    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion, "ms": billion // 1000, "us": 1000, "ns": 1}
+    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion}
     return (mult[unit] * count).bit_length() <= 63
 
 
 @st.composite
 def rule(draw):
     count = draw(st.integers(min_value=1, max_value=10_000))
-    unit = draw(st.sampled_from(["min", "h", "s", "ms", "us", "ns"]))
+    unit = draw(st.sampled_from(["min", "h", "s"]))
     result = f"{count}{unit}"
     assume(freq_fits_in_64_bits(count=count, unit=unit))
     return result
 
 
 @st.composite
 def offset(draw):
-    unit = draw(st.sampled_from(["s", "min", "h", "ms", "us", "ns", None]))
+    unit = draw(st.sampled_from(["s", "min", "h", None]))
     if unit is None:
         return None
     count = draw(st.integers(min_value=1, max_value=100))
@@ -181,11 +172,9 @@ def dynamic_schema_column_list(draw):
     origin=origin(),
     offset=offset(),
 )
-def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
-    # The assumption below is to avoid OOM-ing the GitHub runners.
-    assume(dense_row_count_in_resampled_dataframe([df], rule) < 150000)
-
+def test_resample(lmdb_version_store_v1, any_output_format, df, rule, origin, offset):
     lib = lmdb_version_store_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     sym = "sym"
     logger = get_logger()
     logger.info(f"Data frame generated has {df.shape[0]} rows")
@@ -231,12 +220,12 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 @use_of_function_scoped_fixtures_in_hypothesis_checked
 @given(df_list=dynamic_schema_column_list(), rule=rule(), origin=origin(), offset=offset())
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
-def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
-    # The assumption below is to avoid OOM-ing the GitHub runners.
-    assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 150000)
-
+def test_resample_dynamic_schema(
+    lmdb_version_store_dynamic_schema_v1, any_output_format, df_list, rule, origin, offset
+):
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
     lib = lmdb_version_store_dynamic_schema_v1
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     lib.version_store.clear()
     sym = "sym"
     agg = {f"{name}_{op}": (name, op) for name in common_column_types for op in ALL_AGGREGATIONS}
 
@@ -27,20 +27,14 @@
     ArcticDbNotYetImplemented,
     InternalException,
     UserInputException,
-    ArcticException,
 )
 from arcticdb import QueryBuilder
 from arcticdb.flattener import Flattener
 from arcticdb.version_store import NativeVersionStore
 from arcticdb.version_store._store import VersionedItem
 from arcticdb_ext.exceptions import _ArcticLegacyCompatibilityException, StorageException
 from arcticdb_ext.storage import KeyType, NoDataFoundException
-from arcticdb_ext.version_store import (
-    NoSuchVersionException,
-    StreamDescriptorMismatch,
-    ManualClockVersionStore,
-    DataError,
-)
+from arcticdb_ext.version_store import NoSuchVersionException, StreamDescriptorMismatch, ManualClockVersionStore
 from arcticdb.util.test import (
     sample_dataframe,
     sample_dataframe_only_strings,
@@ -50,12 +44,10 @@
     config_context,
     distinct_timestamps,
 )
-from tests.conftest import Marks
 from tests.util.date import DateRange
 from arcticdb.util.test import equals
 from arcticdb.version_store._store import resolve_defaults
 from tests.util.mark import MACOS, MACOS_WHEEL_BUILD, xfail_azure_chars
-from tests.util.marking import marks
 
 
 @pytest.fixture()
@@ -830,9 +822,11 @@ def test_range_index(basic_store, sym):
     assert_equal(expected, vit.data)
 
 
+@pytest.mark.pipeline  # Covered
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
-@marks([Marks.pipeline, Marks.storage])
-def test_date_range(basic_store, use_date_range_clause):
+@pytest.mark.storage
+def test_date_range(basic_store, use_date_range_clause, any_output_format):
+    basic_store._set_output_format_for_pipeline_tests(any_output_format)
     initial_timestamp = pd.Timestamp("2019-01-01")
     df = pd.DataFrame(data=np.arange(100), index=pd.date_range(initial_timestamp, periods=100))
     sym = "date_test"
@@ -878,9 +872,11 @@ def test_date_range(basic_store, use_date_range_clause):
     assert data_closed[data_closed.columns[0]][-1] == end_offset
 
 
+@pytest.mark.pipeline  # Covered
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
-@marks([Marks.pipeline, Marks.storage])
-def test_date_range_none(basic_store, use_date_range_clause):
+@pytest.mark.storage
+def test_date_range_none(basic_store, use_date_range_clause, any_output_format):
+    basic_store._set_output_format_for_pipeline_tests(any_output_format)
     sym = "date_test2"
     rows = 100
     initial_timestamp = pd.Timestamp("2019-01-01")
@@ -897,9 +893,11 @@ def test_date_range_none(basic_store, use_date_range_clause):
     assert len(data) == rows
 
 
+@pytest.mark.pipeline  # Covered
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
-@marks([Marks.pipeline, Marks.storage])
-def test_date_range_start_equals_end(basic_store, use_date_range_clause):
+@pytest.mark.storage
+def test_date_range_start_equals_end(basic_store, use_date_range_clause, any_output_format):
+    basic_store._set_output_format_for_pipeline_tests(any_output_format)
     sym = "date_test2"
     rows = 100
     initial_timestamp = pd.Timestamp("2019-01-01")
@@ -919,10 +917,12 @@ def test_date_range_start_equals_end(basic_store, use_date_range_clause):
     assert data[data.columns[0]][0] == start_offset
 
 
+@pytest.mark.pipeline  # Covered
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
-@marks([Marks.pipeline, Marks.storage])
-def test_date_range_row_sliced(basic_store_tiny_segment, use_date_range_clause):
+@pytest.mark.storage
+def test_date_range_row_sliced(basic_store_tiny_segment, use_date_range_clause, any_output_format):
     lib = basic_store_tiny_segment
+    lib._set_output_format_for_pipeline_tests(any_output_format)
     sym = "test_date_range_row_sliced"
     # basic_store_tiny_segment produces 2x2 segments
     num_rows = 6
@@ -1657,7 +1657,7 @@ def test_batch_write_then_list_symbol_without_cache(basic_store_factory):
         assert set(lib.list_symbols()) == set(symbols)
 
 
-@marks([Marks.storage, Marks.dedup])
+@pytest.mark.storage
 def test_batch_write_missing_keys_dedup(basic_store_factory):
     """When there is duplicate data to reuse for the current write, we need to access the index key of the previous
     versions in order to refer to the corresponding keys for the deduplicated data."""
@@ -2217,26 +2217,6 @@ def test_batch_read_meta_multiple_versions(object_version_store):
     assert results_dict["sym3"][0].metadata == {"meta3": 1}
     assert results_dict["sym2"][3].metadata == {"meta2": 4}
 
-    # We can supply only an array of symbols, including repeating symbols
-    results_dict = lib.batch_read_metadata_multi(["sym1", "sym2", "sym1", "sym3", "sym2", "sym1", "sym1"])
-    assert results_dict["sym1"][2].metadata == {"meta1": 3}
-    assert len(results_dict["sym1"]) == 1
-    assert results_dict["sym2"][3].metadata == {"meta2": 4}
-    assert results_dict["sym3"][0].metadata == {"meta3": 1}
-
-    # The lists are of different sizr
-    with pytest.raises(ArcticException):
-        results_dict = lib.batch_read_metadata_multi(["sym1", "sym2"], [0, 0, -2])
-
-    # With negative number we can go back from current versions
-    assert lib.batch_read_metadata_multi(["sym1", "sym1"], [-1, -2]) == lib.batch_read_metadata_multi(
-        ["sym1", "sym1"], [2, 1]
-    )
-
-    # Check DataError is thrown when requesting non-existing version
-    with pytest.raises(TypeError):  # Not a good error though - issue 10070002655
-        results_dict = lib.batch_read_metadata_multi(["sym1"], [10])
-
 
 @pytest.mark.storage
 def test_list_symbols(basic_store):
@@ -2746,10 +2726,12 @@ def test_batch_append_with_throw_exception(basic_store, three_col_df):
         )
 
 
+@pytest.mark.pipeline  # Covered
 @pytest.mark.parametrize("use_date_range_clause", [True, False])
-@marks([Marks.pipeline, Marks.storage])
-def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_range_clause):
+@pytest.mark.storage
+def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_range_clause, any_output_format):
     lmdb_version_store = basic_store_tombstone_and_sync_passive
+    lmdb_version_store._set_output_format_for_pipeline_tests(any_output_format)
     symbols = []
     for i in range(5):
         symbols.append("sym_{}".format(i))
@@ -2788,7 +2770,6 @@ def test_batch_read_date_range(basic_store_tombstone_and_sync_passive, use_date_
 
 
 @pytest.mark.parametrize("use_row_range_clause", [True, False])
-@marks([Marks.pipeline])
 def test_batch_read_row_range(lmdb_version_store_v1, use_row_range_clause):
     lib = lmdb_version_store_v1
     num_symbols = 5