From c28301d7313330d6dec31d41879b0735e4f8a257 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 29 Aug 2025 11:01:30 +0300 Subject: [PATCH 1/2] Fix hypothesis tests that occasionally make the CI runners OOM The issue is that we need to output a dense dataframe for Pandas. So if the date range is big and the frequency is small we can end up with too many rows in the output dataframe eating the memory of the runner. This also allows us to test all possible frequencies and extend the date range for generating dataframes. --- .../hypothesis/arcticdb/test_resample.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/python/tests/hypothesis/arcticdb/test_resample.py b/python/tests/hypothesis/arcticdb/test_resample.py index 040a2e1acc..8c8f33c5fc 100644 --- a/python/tests/hypothesis/arcticdb/test_resample.py +++ b/python/tests/hypothesis/arcticdb/test_resample.py @@ -18,11 +18,18 @@ COLUMN_DTYPE = ["float", "int", "uint"] ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"] -MIN_DATE = np.datetime64('1969-06-01') -MAX_DATE = np.datetime64('1970-06-01') +# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well. +MIN_DATE = np.datetime64('1960-01-01') +MAX_DATE = np.datetime64('2025-01-01') pytestmark = pytest.mark.pipeline +def dense_row_count_in_resampled_dataframe(df_list, rule): + """ + The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling + with `rule`. Assumes df_list is sorted by start date and the indexes are not overlapping. + """ + return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value @st.composite def date(draw, min_date, max_date, unit="ns"): @@ -98,14 +105,14 @@ def freq_fits_in_64_bits(count, unit): This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer. """ billion = 1_000_000_000 - mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion} + mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion, 'ms': billion // 1000, 'us' : 1000,'ns': 1} return (mult[unit] * count).bit_length() <= 63 @st.composite def rule(draw): count = draw(st.integers(min_value=1, max_value=10_000)) - unit = draw(st.sampled_from(['min', 'h', 's'])) + unit = draw(st.sampled_from(['min', 'h', 's', 'ms', 'us', 'ns'])) result = f"{count}{unit}" assume(freq_fits_in_64_bits(count=count, unit=unit)) return result @@ -113,7 +120,7 @@ def rule(draw): @st.composite def offset(draw): - unit = draw(st.sampled_from(['s', 'min', 'h', None])) + unit = draw(st.sampled_from(['s', 'min', 'h', 'ms', 'us', 'ns', None])) if unit is None: return None count = draw(st.integers(min_value=1, max_value=100)) @@ -150,6 +157,9 @@ def dynamic_schema_column_list(draw): offset=offset() ) def test_resample(lmdb_version_store_v1, df, rule, origin, offset): + # The assumption below is to avoid OOM-ing the GitHub runners. + assume(dense_row_count_in_resampled_dataframe([df], rule) < 150000) + lib = lmdb_version_store_v1 sym = "sym" logger = get_logger() @@ -198,6 +208,9 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset): ) @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large]) def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset): + # The assumption below is to avoid OOM-ing the GitHub runners. + assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 150000) + common_column_types = compute_common_type_for_columns_in_df_list(df_list) lib = lmdb_version_store_dynamic_schema_v1 lib.version_store.clear() From 42a63e8069f4e54c1872cb48128751014f4ff232 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 23 Sep 2025 12:58:59 +0300 Subject: [PATCH 2/2] Apply code formatting rules --- python/tests/hypothesis/arcticdb/test_resample.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/tests/hypothesis/arcticdb/test_resample.py b/python/tests/hypothesis/arcticdb/test_resample.py index 6a9609a3c9..7ee2f34caa 100644 --- a/python/tests/hypothesis/arcticdb/test_resample.py +++ b/python/tests/hypothesis/arcticdb/test_resample.py @@ -19,11 +19,12 @@ COLUMN_DTYPE = ["float", "int", "uint"] ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"] # Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well. -MIN_DATE = np.datetime64('1960-01-01') -MAX_DATE = np.datetime64('2025-01-01') +MIN_DATE = np.datetime64("1960-01-01") +MAX_DATE = np.datetime64("2025-01-01") pytestmark = pytest.mark.pipeline + def dense_row_count_in_resampled_dataframe(df_list, rule): """ The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling @@ -31,6 +32,7 @@ def dense_row_count_in_resampled_dataframe(df_list, rule): """ return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value + @st.composite def date(draw, min_date, max_date, unit="ns"): """ @@ -109,14 +111,14 @@ def freq_fits_in_64_bits(count, unit): This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer. """ billion = 1_000_000_000 - mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion, 'ms': billion // 1000, 'us' : 1000,'ns': 1} + mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion, "ms": billion // 1000, "us": 1000, "ns": 1} return (mult[unit] * count).bit_length() <= 63 @st.composite def rule(draw): count = draw(st.integers(min_value=1, max_value=10_000)) - unit = draw(st.sampled_from(['min', 'h', 's', 'ms', 'us', 'ns'])) + unit = draw(st.sampled_from(["min", "h", "s", "ms", "us", "ns"])) result = f"{count}{unit}" assume(freq_fits_in_64_bits(count=count, unit=unit)) return result @@ -124,7 +126,7 @@ def rule(draw): @st.composite def offset(draw): - unit = draw(st.sampled_from(['s', 'min', 'h', 'ms', 'us', 'ns', None])) + unit = draw(st.sampled_from(["s", "min", "h", "ms", "us", "ns", None])) if unit is None: return None count = draw(st.integers(min_value=1, max_value=100))