
Commit 9169d2b

Fix hypothesis tests that occasionally make the CI runners OOM [9796087821] (#2624)
#### Reference Issues/PRs

Monday 9796087821

#### What does this implement or fix?

Resampling has to produce a dense output dataframe to match Pandas, so when the date range is wide and the resampling frequency is small, the output dataframe can contain enough rows to exhaust the runner's memory. Bounding the dense row count also lets us test all supported frequencies and widen the date range used for generating dataframes.

#### Checklist

<details>
<summary>Checklist for code changes...</summary>

- [ ] Have you updated the relevant docstrings, documentation and copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes?

</details>
1 parent 3c7a4bb commit 9169d2b
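
A back-of-the-envelope sketch of the failure mode described above (my own illustration, not code from this commit): the dense output row count is the index span divided by the resampling rule, so the widened 1960–2025 date range blows up quickly once the rule drops below a minute.

```python
import pandas as pd

# Span covered by the new MIN_DATE/MAX_DATE bounds (1960-01-01 to 2025-01-01).
span = pd.Timestamp("2025-01-01") - pd.Timestamp("1960-01-01")

for rule in ["1h", "1min", "1s", "1ms"]:
    rows = span.value // pd.Timedelta(rule).value  # .value is nanoseconds
    # A single float64 column costs 8 bytes per row, before any index overhead.
    print(f"{rule:>4}: {rows:>16,} rows = {rows * 8 / 2**30:,.1f} GiB per column")
```

At "1s" this is already about two billion rows (roughly 15 GiB per float64 column), far beyond what a GitHub-hosted runner can hold.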

File tree

1 file changed: +20 -5 lines changed


python/tests/hypothesis/arcticdb/test_resample.py

Lines changed: 20 additions & 5 deletions

```diff
@@ -18,12 +18,21 @@
 
 COLUMN_DTYPE = ["float", "int", "uint"]
 ALL_AGGREGATIONS = ["sum", "mean", "min", "max", "first", "last", "count"]
-MIN_DATE = np.datetime64("1969-06-01")
-MAX_DATE = np.datetime64("1970-06-01")
+# Make sure the start date is pre-epoch so that we can test pre-epoch dates. Not all C++ libraries handle pre-epoch well.
+MIN_DATE = np.datetime64("1960-01-01")
+MAX_DATE = np.datetime64("2025-01-01")
 
 pytestmark = pytest.mark.pipeline
 
 
+def dense_row_count_in_resampled_dataframe(df_list, rule):
+    """
+    The number of rows Arctic's resampling will produce after appending all dataframes in `df_list` and then resampling
+    with `rule`. Assumes df_list is sorted by start date and the indexes are not overlapping.
+    """
+    return (df_list[-1].index[-1] - df_list[0].index[0]).value // pd.Timedelta(rule).value
+
+
 @st.composite
 def date(draw, min_date, max_date, unit="ns"):
     """
```
```diff
@@ -102,22 +111,22 @@ def freq_fits_in_64_bits(count, unit):
     This is used to check if a frequency is usable by Arctic. ArcticDB converts the frequency to signed 64-bit integer.
     """
     billion = 1_000_000_000
-    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion}
+    mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion, "ms": billion // 1000, "us": 1000, "ns": 1}
     return (mult[unit] * count).bit_length() <= 63
 
 
 @st.composite
 def rule(draw):
     count = draw(st.integers(min_value=1, max_value=10_000))
-    unit = draw(st.sampled_from(["min", "h", "s"]))
+    unit = draw(st.sampled_from(["min", "h", "s", "ms", "us", "ns"]))
     result = f"{count}{unit}"
     assume(freq_fits_in_64_bits(count=count, unit=unit))
     return result
 
 
 @st.composite
 def offset(draw):
-    unit = draw(st.sampled_from(["s", "min", "h", None]))
+    unit = draw(st.sampled_from(["s", "min", "h", "ms", "us", "ns", None]))
     if unit is None:
         return None
     count = draw(st.integers(min_value=1, max_value=100))
```
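
ArcticDB stores the rule as a signed 64-bit nanosecond count, which tops out around 292 years, so `freq_fits_in_64_bits` rejects any frequency whose nanosecond value needs more than 63 bits. A standalone sketch of the same check (reproduced here purely for illustration):

```python
billion = 1_000_000_000
mult = {"h": 3600 * billion, "min": 60 * billion, "s": billion,
        "ms": billion // 1000, "us": 1000, "ns": 1}

def freq_fits_in_64_bits(count, unit):
    # True iff count * (nanoseconds per unit) fits in a signed 64-bit integer.
    return (mult[unit] * count).bit_length() <= 63

print(freq_fits_in_64_bits(10_000, "h"))                # True: 3.6e16 ns fits comfortably
print(freq_fits_in_64_bits(2**63 // billion + 1, "s"))  # False: just past the signed 64-bit limit
```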
```diff
@@ -173,6 +182,9 @@ def dynamic_schema_column_list(draw):
     offset=offset(),
 )
 def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe([df], rule) < 150000)
+
     lib = lmdb_version_store_v1
     sym = "sym"
     logger = get_logger()
@@ -220,6 +232,9 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 @given(df_list=dynamic_schema_column_list(), rule=rule(), origin=origin(), offset=offset())
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
 def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
+    # The assumption below is to avoid OOM-ing the GitHub runners.
+    assume(dense_row_count_in_resampled_dataframe(df_list, rule) < 150000)
+
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
     lib = lmdb_version_store_dynamic_schema_v1
     lib.version_store.clear()
```
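
Both tests now guard with hypothesis's `assume`, which discards the current example and draws another whenever the predicted dense row count reaches 150,000, instead of materialising the oversized dataframe. A minimal self-contained sketch of the pattern (hypothetical strategies, not the real test):

```python
import pandas as pd
from hypothesis import HealthCheck, assume, given, settings
from hypothesis import strategies as st

@given(days=st.integers(min_value=1, max_value=100),
       rule=st.sampled_from(["30s", "1min"]))
@settings(deadline=None, max_examples=25,
          suppress_health_check=[HealthCheck.filter_too_much])
def test_guard_skips_oversized_examples(days, rule):
    dense_rows = days * pd.Timedelta("1D").value // pd.Timedelta(rule).value
    # Mirrors the guard in the diff: reject the example rather than build it.
    assume(dense_rows < 150_000)
    assert dense_rows < 150_000  # everything past assume() stays small

test_guard_skips_oversized_examples()
```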
