
Commit a081149

Fix Hypothesis tests that occasionally make the CI runners OOM
The issue is that we must produce a dense dataframe for Pandas, so if the date range is large and the resampling frequency is small, the output dataframe can end up with enough rows to exhaust the runner's memory. Capping the dense row count also lets us test all supported frequencies and extend the date range used for generating dataframes.
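For scale, a back-of-the-envelope sketch (not part of the diff): the widened 1960–2025 date range introduced below, resampled with a 1-second rule, would need a dense output of roughly two billion rows.

import pandas as pd

# Rough estimate of the dense output size for the widened date range
# resampled at 1s; .value is the nanosecond count of the Timedelta.
span = pd.Timestamp("2025-01-01") - pd.Timestamp("1960-01-01")
print(span.value // pd.Timedelta("1s").value)  # 2_051_308_800 rows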
1 parent 478e5ea commit a081149

File tree

1 file changed (+18, -5 lines)


python/tests/hypothesis/arcticdb/test_resample.py

Lines changed: 18 additions & 5 deletions
@@ -18,11 +18,18 @@
 
 COLUMN_DTYPE = ["float", "int", "uint"]
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]
-MIN_DATE = np.datetime64('1969-06-01')
-MAX_DATE = np.datetime64('1970-06-01')
+# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch dates well.
+MIN_DATE = np.datetime64('1960-01-01')
+MAX_DATE = np.datetime64('2025-01-01')
 
 pytestmark = pytest.mark.pipeline
 
+def dense_row_count_in_resampled_dataframe(df_list, rule):
+    """
+    The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling
+    with `rule`. Assumes `df_list` is sorted by start date and the indexes do not overlap.
+    """
+    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value
 
 @st.composite
 def date(draw, min_date, max_date, unit="ns"):
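For intuition, here is a minimal standalone sketch of the new helper (reproduced from the hunk above) showing how the rule drives the dense row count; the column data and the pre-epoch start date are arbitrary. Against the 10,000-row cap used later in the tests, the first call would be filtered out and the second kept.

import pandas as pd

# Helper copied from the diff above, for illustration only.
def dense_row_count_in_resampled_dataframe(df_list, rule):
    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value

# An 11-day index that starts pre-epoch, sampled hourly.
idx = pd.date_range("1969-12-25", "1970-01-05", freq="1h")
df = pd.DataFrame({"col": range(len(idx))}, index=idx)

print(dense_row_count_in_resampled_dataframe([df], "1min"))  # 15840 -> filtered out
print(dense_row_count_in_resampled_dataframe([df], "1h"))    # 264   -> kept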
@@ -98,22 +105,22 @@ def freq_fits_in_64_bits(count, unit):
     This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to a signed 64-bit integer.
     """
     billion = 1_000_000_000
-    mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion}
+    mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion, 'ms': billion // 1000, 'us': 1000, 'ns': 1}
     return (mult[unit] * count).bit_length() <= 63
 
 
 @st.composite
 def rule(draw):
     count = draw(st.integers(min_value=1, max_value=10_000))
-    unit = draw(st.sampled_from(['min', 'h', 's']))
+    unit = draw(st.sampled_from(['min', 'h', 's', 'ms', 'us', 'ns']))
     result = f"{count}{unit}"
     assume(freq_fits_in_64_bits(count=count, unit=unit))
     return result
 
 
 @st.composite
 def offset(draw):
-    unit = draw(st.sampled_from(['s', 'min', 'h', None]))
+    unit = draw(st.sampled_from(['s', 'min', 'h', 'ms', 'us', 'ns', None]))
     if unit is None:
         return None
     count = draw(st.integers(min_value=1, max_value=100))
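For context on the 63-bit guard: 2**63 nanoseconds is roughly 292 years, so every rule the strategy can currently draw (count ≤ 10,000) fits comfortably, but the check keeps the strategy safe if the bounds are ever widened. A quick sketch reusing the code from the hunk above, with an illustrative out-of-range count:

# Reproduced from the diff above, for illustration only.
billion = 1_000_000_000
mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion,
        'ms': billion // 1000, 'us': 1000, 'ns': 1}

def freq_fits_in_64_bits(count, unit):
    return (mult[unit] * count).bit_length() <= 63

print(freq_fits_in_64_bits(10_000, 'h'))     # True: 3.6e16 ns fits in int64
print(freq_fits_in_64_bits(2_600_000, 'h'))  # False: ~9.36e18 ns exceeds 2**63 - 1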
@@ -150,6 +157,9 @@ def dynamic_schema_column_list(draw):
     offset=offset()
 )
 def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe([df], rule) < 10000)
+
     lib = lmdb_version_store_v1
     sym = "sym"
     logger = get_logger()
@@ -198,6 +208,9 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 )
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
 def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 10000)
+
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
     lib = lmdb_version_store_dynamic_schema_v1
     lib.version_store.clear()

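Worth noting about the mechanism: Hypothesis's assume() discards an example that fails the predicate and draws a fresh one instead of failing the test, so the row-count cap prunes pathological rule/date-range combinations without weakening coverage. A minimal, self-contained sketch of the pattern (hypothetical, not from this test suite):

from hypothesis import assume, given, strategies as st

@given(start=st.integers(0, 10**6), span=st.integers(1, 10**9))
def test_materialised_span(start, span):
    # Discard inputs that would be too expensive to materialise;
    # Hypothesis generates a replacement example instead of failing.
    assume(span < 10_000)
    assert len(range(start, start + span)) == span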