diff --git a/python/tests/hypothesis/arcticdb/test_resample.py b/python/tests/hypothesis/arcticdb/test_resample.py
index 12bdd35406..7ee2f34caa 100644
--- a/python/tests/hypothesis/arcticdb/test_resample.py
+++ b/python/tests/hypothesis/arcticdb/test_resample.py
@@ -18,12 +18,21 @@
 
 COLUMN_DTYPE = ["float", "int", "uint"]
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]
-MIN_DATE = np.datetime64("1969-06-01")
-MAX_DATE = np.datetime64("1970-06-01")
+# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well.
+MIN_DATE = np.datetime64("1960-01-01")
+MAX_DATE = np.datetime64("2025-01-01")
 
 pytestmark = pytest.mark.pipeline
 
 
+def dense_row_count_in_resampled_dataframe(df_list, rule):
+    """
+    The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling
+    with `rule`. Assumes df_list is sorted by start date and the indexes are not overlapping.
+    """
+    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value
+
+
 @st.composite
 def date(draw, min_date, max_date, unit="ns"):
     """
@@ -102,14 +111,14 @@ def freq_fits_in_64_bits(count, unit):
     This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer.
     """
     billion = 1_000_000_000
-    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion}
+    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion, "ms": billion // 1000, "us": 1000, "ns": 1}
     return (mult[unit] * count).bit_length() <= 63
 
 
 @st.composite
 def rule(draw):
     count = draw(st.integers(min_value=1, max_value=10_000))
-    unit = draw(st.sampled_from(["min", "h", "s"]))
+    unit = draw(st.sampled_from(["min", "h", "s", "ms", "us", "ns"]))
     result = f"{count}{unit}"
     assume(freq_fits_in_64_bits(count=count, unit=unit))
     return result
@@ -117,7 +126,7 @@ def rule(draw):
 
 @st.composite
 def offset(draw):
-    unit = draw(st.sampled_from(["s", "min", "h", None]))
+    unit = draw(st.sampled_from(["s", "min", "h", "ms", "us", "ns", None]))
     if unit is None:
         return None
     count = draw(st.integers(min_value=1, max_value=100))
@@ -173,6 +182,9 @@ def dynamic_schema_column_list(draw):
     offset=offset(),
 )
 def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe([df], rule) < 150000)
+
     lib = lmdb_version_store_v1
     sym = "sym"
     logger = get_logger()
@@ -220,6 +232,9 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 @given(df_list=dynamic_schema_column_list(), rule=rule(), origin=origin(), offset=offset())
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
 def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 150000)
+
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
    lib = lmdb_version_store_dynamic_schema_v1
    lib.version_store.clear()
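
Illustration (not part of the patch): a minimal sketch of how the new dense-row-count guard behaves. The helper is copied from the diff above and the 150000 threshold matches the patch; the sample dataframe and the column name "col" are made up for the example.

import numpy as np
import pandas as pd


def dense_row_count_in_resampled_dataframe(df_list, rule):
    # Number of rule-sized buckets spanned by the combined index (both values are in nanoseconds),
    # assuming df_list is sorted by start date with non-overlapping indexes.
    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value


# One year of daily data: a 1-hour rule spans roughly 8,700 dense buckets, so assume() keeps the case.
daily = pd.DataFrame({"col": np.arange(365)}, index=pd.date_range("1960-01-01", periods=365, freq="D"))
print(dense_row_count_in_resampled_dataframe([daily], "1h") < 150000)  # True

# The same span with a 1-second rule spans roughly 31 million buckets, so assume() rejects the case
# before the test materialises an output large enough to OOM the GitHub runners.
print(dense_row_count_in_resampled_dataframe([daily], "1s") < 150000)  # False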