Skip to content

Commit fecb862

Browse files
authored
fix(ci-insights): Precisely report flaky detection when exceeding budget (#248)
Unify new test metrics to store only relevant information and also improve the generated report to have real test durations and retries.
1 parent 78863ff commit fecb862

File tree

2 files changed

+202
-37
lines changed

2 files changed

+202
-37
lines changed

pytest_mergify/flaky_detection.py

Lines changed: 71 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import collections
12
import dataclasses
23
import datetime
34
import os
@@ -20,6 +21,34 @@
2021
_MIN_TEST_RETRY_BUDGET_DURATION = datetime.timedelta(seconds=4)
2122

2223

24+
@dataclasses.dataclass
25+
class _NewTestMetrics:
26+
"Represents metrics collected for a new test."
27+
28+
initial_duration: datetime.timedelta = dataclasses.field(
29+
default_factory=datetime.timedelta
30+
)
31+
"Represents the duration of the initial execution of the test."
32+
33+
retry_count: int = dataclasses.field(default=0)
34+
"Represents the number of times the test has been retried so far."
35+
36+
scheduled_retry_count: int = dataclasses.field(default=0)
37+
"Represents the number of retries that have been scheduled for this test depending on the budget."
38+
39+
total_duration: datetime.timedelta = dataclasses.field(
40+
default_factory=datetime.timedelta
41+
)
42+
"Represents the total duration spent executing this test, including retries."
43+
44+
def add_duration(self, duration: datetime.timedelta) -> None:
45+
if not self.initial_duration:
46+
self.initial_duration = duration
47+
48+
self.retry_count += 1
49+
self.total_duration += duration
50+
51+
2352
@dataclasses.dataclass
2453
class FlakyDetector:
2554
token: str
@@ -37,11 +66,8 @@ class FlakyDetector:
3766
init=False,
3867
default_factory=list,
3968
)
40-
_new_test_durations: typing.Dict[str, datetime.timedelta] = dataclasses.field(
41-
init=False, default_factory=dict
42-
)
43-
_new_test_retries: typing.DefaultDict[str, int] = dataclasses.field(
44-
init=False, default_factory=lambda: typing.DefaultDict(int)
69+
_new_test_metrics: typing.Dict[str, _NewTestMetrics] = dataclasses.field(
70+
init=False, default_factory=lambda: collections.defaultdict(_NewTestMetrics)
4571
)
4672
_over_length_tests: typing.Set[str] = dataclasses.field(
4773
init=False, default_factory=set
@@ -93,14 +119,12 @@ def detect_from_report(self, report: _pytest.reports.TestReport) -> bool:
93119
if test in self._existing_tests:
94120
return False
95121

96-
if test in self._new_test_durations:
97-
return True
98-
99122
if len(test) > _MAX_TEST_NAME_LENGTH:
100123
self._over_length_tests.add(test)
101124
return False
102125

103-
self._new_test_durations[test] = duration
126+
self._new_test_metrics[test].add_duration(duration)
127+
104128
return True
105129

106130
def _get_budget_deadline(self) -> datetime.datetime:
@@ -122,15 +146,18 @@ def _get_remaining_items(
122146
# its duration yet, so allocate max retries directly and rely on the
123147
# budget deadline instead of going through the budget allocation.
124148
if (
125-
len(self._new_test_durations) == 0
149+
len(self._new_test_metrics) == 0
126150
and self.last_collected_test
127151
and self.last_collected_test not in self._existing_tests
128152
):
129153
allocation = {self.last_collected_test: _MAX_TEST_RETRY_COUNT}
130154
else:
131155
allocation = _allocate_test_retries(
132156
self._get_budget_duration(),
133-
self._new_test_durations,
157+
{
158+
test: metrics.initial_duration
159+
for test, metrics in self._new_test_metrics.items()
160+
},
134161
)
135162

136163
items_to_retry = [item for item in session.items if item.nodeid in allocation]
@@ -139,7 +166,7 @@ def _get_remaining_items(
139166
for item in items_to_retry:
140167
expected_retries = int(allocation[item.nodeid])
141168
existing_retries = int(
142-
self._new_test_retries.get(item.nodeid, 0),
169+
self._new_test_metrics[item.nodeid].scheduled_retry_count,
143170
)
144171

145172
remaining_retries = max(0, expected_retries - existing_retries)
@@ -149,7 +176,7 @@ def _get_remaining_items(
149176
if not item.parent:
150177
continue
151178

152-
self._new_test_retries[item.nodeid] += 1
179+
self._new_test_metrics[item.nodeid].scheduled_retry_count += 1
153180

154181
clone = item.__class__.from_parent(
155182
name=item.name,
@@ -207,48 +234,57 @@ def handle_item(
207234
def make_report(self) -> str:
208235
result = "🐛 Flaky detection"
209236
if self._over_length_tests:
210-
result += f"{os.linesep}- Skipped {len(self._over_length_tests)} test(s):"
237+
result += (
238+
f"{os.linesep}- Skipped {len(self._over_length_tests)} "
239+
f"test{'s' if len(self._over_length_tests) > 1 else ''}:"
240+
)
211241
for test in self._over_length_tests:
212242
result += (
213-
f"{os.linesep} • '{test}' has not been tested multiple "
214-
f"times because the name of the test exceeds our limit of "
215-
f"{_MAX_TEST_NAME_LENGTH} characters"
243+
f"{os.linesep} • '{test}' has not been tested multiple times because the name of the test "
244+
f"exceeds our limit of {_MAX_TEST_NAME_LENGTH} characters"
216245
)
217246

218-
if not self._new_test_durations:
247+
if not self._new_test_metrics:
219248
result += f"{os.linesep}- No new tests detected, but we are watching 👀"
220249

221250
return result
222251

223252
total_retry_duration_seconds = sum(
224-
self._new_test_durations[test_name].total_seconds() * retry_count
225-
for test_name, retry_count in self._new_test_retries.items()
226-
if retry_count > 0
253+
metrics.total_duration.total_seconds()
254+
for metrics in self._new_test_metrics.values()
227255
)
228256
budget_duration_seconds = self._get_budget_duration().total_seconds()
229257
result += (
230-
f"{os.linesep}- Used {total_retry_duration_seconds / budget_duration_seconds * 100:.2f} % "
231-
f"of the budget ({total_retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
258+
f"{os.linesep}- Used {total_retry_duration_seconds / budget_duration_seconds * 100:.2f} % of the budget "
259+
f"({total_retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
232260
)
233261

234262
result += (
235-
f"{os.linesep}- Active for {len(self._new_test_durations)} new test(s):"
263+
f"{os.linesep}- Active for {len(self._new_test_metrics)} new "
264+
f"test{'s' if len(self._new_test_metrics) > 1 else ''}:"
236265
)
237-
for test, duration in self._new_test_durations.items():
238-
retry_count = self._new_test_retries.get(test, 0)
239-
if retry_count == 0:
240-
result += f"{os.linesep} • '{test}' is too slow to be tested at least {_MIN_TEST_RETRY_COUNT} times within the budget"
241-
continue
242-
elif retry_count < _MIN_TEST_RETRY_COUNT:
243-
result += f"{os.linesep} • '{test}' has been tested only {retry_count} times to avoid exceeding the budget"
266+
for test, metrics in self._new_test_metrics.items():
267+
if metrics.scheduled_retry_count == 0:
268+
result += (
269+
f"{os.linesep} • '{test}' is too slow to be tested at least {_MIN_TEST_RETRY_COUNT} times "
270+
"within the budget"
271+
)
244272
continue
245273

246-
retry_duration_seconds = duration.total_seconds() * retry_count
274+
if metrics.retry_count < metrics.scheduled_retry_count:
275+
result += (
276+
f"{os.linesep} • '{test}' has been tested only {metrics.retry_count} "
277+
f"time{'s' if metrics.retry_count > 1 else ''} instead of {metrics.scheduled_retry_count} "
278+
f"time{'s' if metrics.scheduled_retry_count > 1 else ''} to avoid exceeding the budget"
279+
)
280+
continue
247281

282+
retry_duration_seconds = metrics.total_duration.total_seconds()
248283
result += (
249-
f"{os.linesep} • '{test}' has been tested {retry_count} "
250-
f"times using approx. {retry_duration_seconds / budget_duration_seconds * 100:.2f} % "
251-
f"of the budget ({retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
284+
f"{os.linesep} • '{test}' has been tested {metrics.retry_count} "
285+
f"time{'s' if metrics.retry_count > 1 else ''} using approx. "
286+
f"{retry_duration_seconds / budget_duration_seconds * 100:.2f} % of the budget "
287+
f"({retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
252288
)
253289

254290
return result

tests/test_ci_insights.py

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
import datetime
12
import re
23
import typing
34

5+
import _pytest.nodes
6+
import _pytest.pytester
7+
import _pytest.reports
48
import pytest
59
import responses
610
from opentelemetry.sdk import trace
711

12+
import pytest_mergify
813
from pytest_mergify import ci_insights, flaky_detection
914

1015
from . import conftest
@@ -164,10 +169,10 @@ def test_corge():
164169

165170
assert re.search(
166171
r"""🐛 Flaky detection
167-
- Skipped 1 test\(s\):
172+
- Skipped 1 test:
168173
• 'test_flaky_detection\.py::test_quux_[a]+' has not been tested multiple times because the name of the test exceeds our limit of \d+ characters
169174
- Used [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)
170-
- Active for 3 new test\(s\):
175+
- Active for 3 new tests:
171176
• 'test_flaky_detection\.py::test_bar' has been tested \d+ times using approx\. [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)
172177
• 'test_flaky_detection\.py::test_baz' has been tested \d+ times using approx\. [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)
173178
• 'test_flaky_detection\.py::test_corge' has been tested \d+ times using approx\. [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)""",
@@ -264,6 +269,130 @@ async def test_bar():
264269
}
265270

266271

272+
@responses.activate
273+
def test_flaky_detection_slow_test_not_retried(
274+
monkeypatch: pytest.MonkeyPatch,
275+
pytester: _pytest.pytester.Pytester,
276+
) -> None:
277+
"""
278+
Test that a slow test is not retried when it can't reach
279+
`flaky_detection._MIN_TEST_RETRY_COUNT` within the budget.
280+
"""
281+
_set_test_environment(monkeypatch)
282+
_make_quarantine_mock()
283+
_make_test_names_mock(
284+
[
285+
"test_flaky_detection_slow_test_not_retried.py::test_existing",
286+
]
287+
)
288+
289+
class CustomPlugin:
290+
def pytest_runtest_makereport(
291+
self,
292+
item: _pytest.nodes.Item,
293+
call: _pytest.reports.TestReport,
294+
) -> None:
295+
if call.when != "call":
296+
return
297+
298+
if "test_slow" in item.nodeid:
299+
call.duration = 10.0 # Simulate a slow test.
300+
else:
301+
call.duration = 0.001
302+
303+
pytester.makepyfile(
304+
"""
305+
def test_existing():
306+
assert True
307+
308+
def test_fast():
309+
assert True
310+
311+
def test_slow():
312+
assert True
313+
"""
314+
)
315+
316+
result = pytester.runpytest_inprocess(
317+
plugins=[CustomPlugin(), pytest_mergify.PytestMergify()]
318+
)
319+
result.assert_outcomes(passed=1003)
320+
321+
# `test_fast` should have been tested successfully.
322+
assert re.search(
323+
r"'test_flaky_detection_slow_test_not_retried\.py::test_fast' has been tested \d+ times",
324+
result.stdout.str(),
325+
)
326+
327+
assert (
328+
f"'test_flaky_detection_slow_test_not_retried.py::test_slow' is too slow to be tested at least {flaky_detection._MIN_TEST_RETRY_COUNT} times within the budget"
329+
in result.stdout.str()
330+
)
331+
332+
333+
@responses.activate
334+
def test_flaky_detection_budget_deadline_stops_retries(
335+
monkeypatch: pytest.MonkeyPatch,
336+
pytester: _pytest.pytester.Pytester,
337+
) -> None:
338+
"""
339+
Test that retries are stopped when they would exceed the budget deadline.
340+
"""
341+
_set_test_environment(monkeypatch)
342+
_make_quarantine_mock()
343+
_make_test_names_mock(
344+
[
345+
"test_flaky_detection_budget_deadline_stops_retries.py::test_existing",
346+
]
347+
)
348+
349+
class CustomPlugin:
350+
deadline_patched: bool = False
351+
352+
def pytest_runtest_protocol(self, item: _pytest.nodes.Item) -> None:
353+
plugin = None
354+
for existing in item.session.config.pluginmanager.get_plugins():
355+
if isinstance(existing, pytest_mergify.PytestMergify):
356+
plugin = existing
357+
358+
if not plugin or not plugin.mergify_ci.flaky_detector:
359+
return
360+
361+
# The deadline is set so we started detecting flaky tests.
362+
if plugin.mergify_ci.flaky_detector._deadline and not self.deadline_patched:
363+
# Set the deadline in the past to stop immediately.
364+
plugin.mergify_ci.flaky_detector._deadline = datetime.datetime.now(
365+
datetime.timezone.utc
366+
) - datetime.timedelta(hours=1)
367+
368+
self.deadline_patched = True
369+
370+
pytester.makepyfile(
371+
"""
372+
def test_existing():
373+
assert True
374+
375+
def test_new():
376+
assert True
377+
"""
378+
)
379+
380+
result = pytester.runpytest_inprocess(
381+
plugins=[pytest_mergify.PytestMergify(), CustomPlugin()]
382+
)
383+
384+
# We should have:
385+
# - 1 execution of `test_existing`,
386+
# - 1 initial execution of `test_new`,
387+
# - Only 1 retry of `test_new` before the deadline is reached.
388+
result.assert_outcomes(passed=3)
389+
390+
assert re.search(
391+
r"'test_flaky_detection_budget_deadline_stops_retries\.py::test_new' has been tested only \d+ times instead of \d+ times to avoid exceeding the budget",
392+
result.stdout.str(),
393+
)
394+
395+
267396
def _get_span_counts(
268397
spans: typing.Dict[str, trace.ReadableSpan],
269398
) -> typing.Dict[str, int]:

0 commit comments

Comments
 (0)