
Commit 9169d2b

Fix hypothesis tests that occasionally make the CI runners OOM [9796087821] (#2624)
#### Reference Issues/PRs

Monday 9796087821

#### What does this implement or fix?

Resampling has to produce a dense output dataframe to match Pandas, so when the date range is wide and the resampling frequency is small, the output dataframe can contain enough rows to exhaust the runner's memory. Bounding the dense row count also lets us test all supported frequencies and widen the date range used for generating dataframes.

#### Checklist

<details>
<summary>Checklist for code changes...</summary>

- [ ] Have you updated the relevant docstrings, documentation and copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes?

</details>
1 parent 3c7a4bb commit 9169d2b
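
A back-of-the-envelope sketch of the failure mode described above (my own illustration, not code from this commit): the dense output row count is the index span divided by the resampling rule, so the widened 1960–2025 date range blows up quickly once the rule drops below a minute.

```python
import pandas as pd

# Span covered by the new MIN_DATE/MAX_DATE bounds (1960-01-01 to 2025-01-01).
span = pd.Timestamp("2025-01-01") - pd.Timestamp("1960-01-01")

for rule in ["1h", "1min", "1s", "1ms"]:
    rows = span.value // pd.Timedelta(rule).value  # .value is nanoseconds
    # A single float64 column costs 8 bytes per row, before any index overhead.
    print(f"{rule:>4}: {rows:>16,} rows = {rows * 8 / 2**30:,.1f} GiB per column")
```

At "1s" this is already about two billion rows (roughly 15 GiB per float64 column), far beyond what a GitHub-hosted runner can hold.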

File tree

1 file changed: +20 -5 lines changed


python/tests/hypothesis/arcticdb/test_resample.py

Lines changed: 20 additions & 5 deletions

```diff
@@ -18,12 +18,21 @@
 
 COLUMN_DTYPE = ["float", "int", "uint"]
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]
-MIN_DATE = np.datetime64("1969-06-01")
-MAX_DATE = np.datetime64("1970-06-01")
+# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well.
+MIN_DATE = np.datetime64("1960-01-01")
+MAX_DATE = np.datetime64("2025-01-01")
 
 pytestmark = pytest.mark.pipeline
 
 
+def dense_row_count_in_resampled_dataframe(df_list, rule):
+    """
+    The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling
+    with `rule`. Assumes df_list is sorted by start date and the indexes are not overlapping.
+    """
+    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value
+
+
 @st.composite
 def date(draw, min_date, max_date, unit="ns"):
     """
```
```diff
@@ -102,22 +111,22 @@ def freq_fits_in_64_bits(count, unit):
     This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer.
     """
     billion = 1_000_000_000
-    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion}
+    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion, "ms": billion // 1000, "us": 1000, "ns": 1}
     return (mult[unit] * count).bit_length() <= 63
 
 
 @st.composite
 def rule(draw):
     count = draw(st.integers(min_value=1, max_value=10_000))
-    unit = draw(st.sampled_from(["min", "h", "s"]))
+    unit = draw(st.sampled_from(["min", "h", "s", "ms", "us", "ns"]))
     result = f"{count}{unit}"
     assume(freq_fits_in_64_bits(count=count, unit=unit))
     return result
 
 
 @st.composite
 def offset(draw):
-    unit = draw(st.sampled_from(["s", "min", "h", None]))
+    unit = draw(st.sampled_from(["s", "min", "h", "ms", "us", "ns", None]))
     if unit is None:
         return None
     count = draw(st.integers(min_value=1, max_value=100))
```
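
ArcticDB stores the rule as a signed 64-bit nanosecond count, which tops out around 292 years, so `freq_fits_in_64_bits` rejects any frequency whose nanosecond value needs more than 63 bits. A standalone sketch of the same check (reproduced here purely for illustration):

```python
billion = 1_000_000_000
mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion,
        "ms": billion // 1000, "us": 1000, "ns": 1}

def freq_fits_in_64_bits(count, unit):
    # True iff count * (nanoseconds per unit) fits in a signed 64-bit integer.
    return (mult[unit] * count).bit_length() <= 63

print(freq_fits_in_64_bits(10_000, "h"))                # True: 3.6e16 ns fits comfortably
print(freq_fits_in_64_bits(2**63 // billion + 1, "s"))  # False: just past the signed 64-bit limit
```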
```diff
@@ -173,6 +182,9 @@ def dynamic_schema_column_list(draw):
     offset=offset(),
 )
 def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe([df], rule) < 150000)
+
     lib = lmdb_version_store_v1
     sym = "sym"
     logger = get_logger()
@@ -220,6 +232,9 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 @given(df_list=dynamic_schema_column_list(), rule=rule(), origin=origin(), offset=offset())
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
 def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 150000)
+
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
     lib = lmdb_version_store_dynamic_schema_v1
     lib.version_store.clear()
```
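
Both tests now guard with hypothesis's `assume`, which discards the current example and draws another whenever the predicted dense row count reaches 150,000, instead of materialising the oversized dataframe. A minimal self-contained sketch of the pattern (hypothetical strategies, not the real test):

```python
import pandas as pd
from hypothesis import HealthCheck, assume, given, settings
from hypothesis import strategies as st

@given(days=st.integers(min_value=1, max_value=100),
       rule=st.sampled_from(["30s", "1min"]))
@settings(deadline=None, max_examples=25,
          suppress_health_check=[HealthCheck.filter_too_much])
def test_guard_skips_oversized_examples(days, rule):
    dense_rows = days * pd.Timedelta("1D").value // pd.Timedelta(rule).value
    # Mirrors the guard in the diff: reject the example rather than build it.
    assume(dense_rows < 150_000)
    assert dense_rows < 150_000  # everything past assume() stays small

test_guard_skips_oversized_examples()
```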
