
Commit a081149

Fix Hypothesis tests that occasionally make the CI runners OOM
The issue is that we must produce a dense dataframe for Pandas, so if the date range is large and the resampling frequency is small, the output dataframe can end up with enough rows to exhaust the runner's memory. Capping the dense row count also lets us test all supported frequencies and extend the date range used for generating dataframes.
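For scale, a back-of-the-envelope sketch (not part of the diff): the widened 1960–2025 date range introduced below, resampled with a 1-second rule, would need a dense output of roughly two billion rows.

import pandas as pd

# Rough estimate of the dense output size for the widened date range
# resampled at 1s; .value is the nanosecond count of the Timedelta.
span = pd.Timestamp("2025-01-01") - pd.Timestamp("1960-01-01")
print(span.value // pd.Timedelta("1s").value)  # 2_051_308_800 rows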
1 parent 478e5ea commit a081149

File tree

1 file changed (+18, -5 lines)


python/tests/hypothesis/arcticdb/test_resample.py

Lines changed: 18 additions & 5 deletions
@@ -18,11 +18,18 @@
 
 COLUMN_DTYPE = ["float", "int", "uint"]
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]
-MIN_DATE = np.datetime64('1969-06-01')
-MAX_DATE = np.datetime64('1970-06-01')
+# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch dates well.
+MIN_DATE = np.datetime64('1960-01-01')
+MAX_DATE = np.datetime64('2025-01-01')
 
 pytestmark = pytest.mark.pipeline
 
+def dense_row_count_in_resampled_dataframe(df_list, rule):
+    """
+    The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling
+    with `rule`. Assumes `df_list` is sorted by start date and the indexes do not overlap.
+    """
+    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value
 
 @st.composite
 def date(draw, min_date, max_date, unit="ns"):
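For intuition, here is a minimal standalone sketch of the new helper (reproduced from the hunk above) showing how the rule drives the dense row count; the column data and the pre-epoch start date are arbitrary. Against the 10,000-row cap used later in the tests, the first call would be filtered out and the second kept.

import pandas as pd

# Helper copied from the diff above, for illustration only.
def dense_row_count_in_resampled_dataframe(df_list, rule):
    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value

# An 11-day index that starts pre-epoch, sampled hourly.
idx = pd.date_range("1969-12-25", "1970-01-05", freq="1h")
df = pd.DataFrame({"col": range(len(idx))}, index=idx)

print(dense_row_count_in_resampled_dataframe([df], "1min"))  # 15840 -> filtered out
print(dense_row_count_in_resampled_dataframe([df], "1h"))    # 264   -> kept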
@@ -98,22 +105,22 @@ def freq_fits_in_64_bits(count, unit):
     This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to a signed 64-bit integer.
     """
     billion = 1_000_000_000
-    mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion}
+    mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion, 'ms': billion // 1000, 'us': 1000, 'ns': 1}
     return (mult[unit] * count).bit_length() <= 63
 
 
 @st.composite
 def rule(draw):
     count = draw(st.integers(min_value=1, max_value=10_000))
-    unit = draw(st.sampled_from(['min', 'h', 's']))
+    unit = draw(st.sampled_from(['min', 'h', 's', 'ms', 'us', 'ns']))
     result = f"{count}{unit}"
     assume(freq_fits_in_64_bits(count=count, unit=unit))
     return result
 
 
 @st.composite
 def offset(draw):
-    unit = draw(st.sampled_from(['s', 'min', 'h', None]))
+    unit = draw(st.sampled_from(['s', 'min', 'h', 'ms', 'us', 'ns', None]))
     if unit is None:
         return None
     count = draw(st.integers(min_value=1, max_value=100))
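For context on the 63-bit guard: 2**63 nanoseconds is roughly 292 years, so every rule the strategy can currently draw (count ≤ 10,000) fits comfortably, but the check keeps the strategy safe if the bounds are ever widened. A quick sketch reusing the code from the hunk above, with an illustrative out-of-range count:

# Reproduced from the diff above, for illustration only.
billion = 1_000_000_000
mult = {'h': 3600 * billion, 'min': 60 * billion, 's': billion,
        'ms': billion // 1000, 'us': 1000, 'ns': 1}

def freq_fits_in_64_bits(count, unit):
    return (mult[unit] * count).bit_length() <= 63

print(freq_fits_in_64_bits(10_000, 'h'))     # True: 3.6e16 ns fits in int64
print(freq_fits_in_64_bits(2_600_000, 'h'))  # False: ~9.36e18 ns exceeds 2**63 - 1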
@@ -150,6 +157,9 @@ def dynamic_schema_column_list(draw):
     offset=offset()
 )
 def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe([df], rule) < 10000)
+
     lib = lmdb_version_store_v1
     sym = "sym"
     logger = get_logger()
@@ -198,6 +208,9 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 )
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
 def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 10000)
+
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
     lib = lmdb_version_store_dynamic_schema_v1
     lib.version_store.clear()

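Worth noting about the mechanism: Hypothesis's assume() discards an example that fails the predicate and draws a fresh one instead of failing the test, so the row-count cap prunes pathological rule/date-range combinations without weakening coverage. A minimal, self-contained sketch of the pattern (hypothetical, not from this test suite):

from hypothesis import assume, given, strategies as st

@given(start=st.integers(0, 10**6), span=st.integers(1, 10**9))
def test_materialised_span(start, span):
    # Discard inputs that would be too expensive to materialise;
    # Hypothesis generates a replacement example instead of failing.
    assume(span < 10_000)
    assert len(range(start, start + span)) == span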