fix(ci-insights): Precisely report flaky detection when exceeding budget (#248)

remyduthu · web-flow · commit fecb862c40d9 · 2025-10-16T09:28:24.000Z
Unify new test metrics to store only relevant information and also
improve the generated report to have real test durations and retries.
diff --git a/pytest_mergify/flaky_detection.py b/pytest_mergify/flaky_detection.py
@@ -1,3 +1,4 @@
+import collections
 import dataclasses
 import datetime
 import os
@@ -20,6 +21,34 @@
 _MIN_TEST_RETRY_BUDGET_DURATION = datetime.timedelta(seconds=4)
 
 
+@dataclasses.dataclass
+class _NewTestMetrics:
+    "Represents metrics collected for a new test."
+
+    initial_duration: datetime.timedelta = dataclasses.field(
+        default_factory=datetime.timedelta
+    )
+    "Represents the duration of the initial execution of the test."
+
+    retry_count: int = dataclasses.field(default=0)
+    "Represents the number of executions recorded via `add_duration`, the initial execution included."
+
+    scheduled_retry_count: int = dataclasses.field(default=0)
+    "Represents the number of retries that have been scheduled for this test depending on the budget."
+
+    total_duration: datetime.timedelta = dataclasses.field(
+        default_factory=datetime.timedelta
+    )
+    "Represents the total duration spent executing this test, including retries."
+
+    def add_duration(self, duration: datetime.timedelta) -> None:
+        "Records one execution; only the first recorded duration becomes `initial_duration`."
+        if self.retry_count == 0:  # A zero `timedelta` is falsy, so key off the counter instead.
+            self.initial_duration = duration
+        self.retry_count += 1
+        self.total_duration += duration
+
+
 @dataclasses.dataclass
 class FlakyDetector:
     token: str
@@ -37,11 +66,8 @@ class FlakyDetector:
         init=False,
         default_factory=list,
     )
-    _new_test_durations: typing.Dict[str, datetime.timedelta] = dataclasses.field(
-        init=False, default_factory=dict
-    )
-    _new_test_retries: typing.DefaultDict[str, int] = dataclasses.field(
-        init=False, default_factory=lambda: typing.DefaultDict(int)
+    _new_test_metrics: typing.Dict[str, _NewTestMetrics] = dataclasses.field(
+        init=False, default_factory=lambda: collections.defaultdict(_NewTestMetrics)
     )
     _over_length_tests: typing.Set[str] = dataclasses.field(
         init=False, default_factory=set
@@ -93,14 +119,12 @@ def detect_from_report(self, report: _pytest.reports.TestReport) -> bool:
         if test in self._existing_tests:
             return False
 
-        if test in self._new_test_durations:
-            return True
-
         if len(test) > _MAX_TEST_NAME_LENGTH:
             self._over_length_tests.add(test)
             return False
 
-        self._new_test_durations[test] = duration
+        self._new_test_metrics[test].add_duration(duration)
+
         return True
 
     def _get_budget_deadline(self) -> datetime.datetime:
@@ -122,15 +146,18 @@ def _get_remaining_items(
         # its duration yet, so allocate max retries directly and rely on the
         # budget deadline instead of going through the budget allocation.
         if (
-            len(self._new_test_durations) == 0
+            len(self._new_test_metrics) == 0
             and self.last_collected_test
             and self.last_collected_test not in self._existing_tests
         ):
             allocation = {self.last_collected_test: _MAX_TEST_RETRY_COUNT}
         else:
             allocation = _allocate_test_retries(
                 self._get_budget_duration(),
-                self._new_test_durations,
+                {
+                    test: metrics.initial_duration
+                    for test, metrics in self._new_test_metrics.items()
+                },
             )
 
         items_to_retry = [item for item in session.items if item.nodeid in allocation]
@@ -139,7 +166,7 @@ def _get_remaining_items(
         for item in items_to_retry:
             expected_retries = int(allocation[item.nodeid])
             existing_retries = int(
-                self._new_test_retries.get(item.nodeid, 0),
+                self._new_test_metrics[item.nodeid].scheduled_retry_count,
             )
 
             remaining_retries = max(0, expected_retries - existing_retries)
@@ -149,7 +176,7 @@ def _get_remaining_items(
                 if not item.parent:
                     continue
 
-                self._new_test_retries[item.nodeid] += 1
+                self._new_test_metrics[item.nodeid].scheduled_retry_count += 1
 
                 clone = item.__class__.from_parent(
                     name=item.name,
@@ -207,48 +234,57 @@ def handle_item(
     def make_report(self) -> str:
         result = "🐛 Flaky detection"
         if self._over_length_tests:
-            result += f"{os.linesep}- Skipped {len(self._over_length_tests)} test(s):"
+            result += (
+                f"{os.linesep}- Skipped {len(self._over_length_tests)} "
+                f"test{'s' if len(self._over_length_tests) > 1 else ''}:"
+            )
             for test in self._over_length_tests:
                 result += (
-                    f"{os.linesep}    • '{test}' has not been tested multiple "
-                    f"times because the name of the test exceeds our limit of "
-                    f"{_MAX_TEST_NAME_LENGTH} characters"
+                    f"{os.linesep}    • '{test}' has not been tested multiple times because the name of the test "
+                    f"exceeds our limit of {_MAX_TEST_NAME_LENGTH} characters"
                 )
 
-        if not self._new_test_durations:
+        if not self._new_test_metrics:
             result += f"{os.linesep}- No new tests detected, but we are watching 👀"
 
             return result
 
         total_retry_duration_seconds = sum(
-            self._new_test_durations[test_name].total_seconds() * retry_count
-            for test_name, retry_count in self._new_test_retries.items()
-            if retry_count > 0
+            metrics.total_duration.total_seconds()  # Sum of every duration recorded via `add_duration()`.
+            for metrics in self._new_test_metrics.values()
         )
         budget_duration_seconds = self._get_budget_duration().total_seconds()
         result += (
-            f"{os.linesep}- Used {total_retry_duration_seconds / budget_duration_seconds * 100:.2f} % "
-            f"of the budget ({total_retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
+            f"{os.linesep}- Used {total_retry_duration_seconds / budget_duration_seconds * 100:.2f} % of the budget "
+            f"({total_retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
         )
 
         result += (
-            f"{os.linesep}- Active for {len(self._new_test_durations)} new test(s):"
+            f"{os.linesep}- Active for {len(self._new_test_metrics)} new "
+            f"test{'s' if len(self._new_test_metrics) > 1 else ''}:"
         )
-        for test, duration in self._new_test_durations.items():
-            retry_count = self._new_test_retries.get(test, 0)
-            if retry_count == 0:
-                result += f"{os.linesep}    • '{test}' is too slow to be tested at least {_MIN_TEST_RETRY_COUNT} times within the budget"
-                continue
-            elif retry_count < _MIN_TEST_RETRY_COUNT:
-                result += f"{os.linesep}    • '{test}' has been tested only {retry_count} times to avoid exceeding the budget"
+        for test, metrics in self._new_test_metrics.items():
+            if metrics.scheduled_retry_count == 0:  # No retry was scheduled for this test.
+                result += (
+                    f"{os.linesep}    • '{test}' is too slow to be tested at least {_MIN_TEST_RETRY_COUNT} times "
+                    "within the budget"
+                )
                 continue
 
-            retry_duration_seconds = duration.total_seconds() * retry_count
+            if metrics.retry_count < metrics.scheduled_retry_count:  # NOTE(review): `retry_count` is bumped by every `add_duration()` call, the first one included — confirm `<` rather than `<=` is intended.
+                result += (
+                    f"{os.linesep}    • '{test}' has been tested only {metrics.retry_count} "
+                    f"time{'s' if metrics.retry_count > 1 else ''} instead of {metrics.scheduled_retry_count} "  # NOTE(review): renders "0 time" if `retry_count` is 0.
+                    f"time{'s' if metrics.scheduled_retry_count > 1 else ''} to avoid exceeding the budget"
+                )
+                continue
 
+            retry_duration_seconds = metrics.total_duration.total_seconds()  # Recorded total for this test, not an estimate.
             result += (
-                f"{os.linesep}    • '{test}' has been tested {retry_count} "
-                f"times using approx. {retry_duration_seconds / budget_duration_seconds * 100:.2f} % "
-                f"of the budget ({retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
+                f"{os.linesep}    • '{test}' has been tested {metrics.retry_count} "
+                f"time{'s' if metrics.retry_count > 1 else ''} using approx. "
+                f"{retry_duration_seconds / budget_duration_seconds * 100:.2f} % of the budget "
+                f"({retry_duration_seconds:.2f} s/{budget_duration_seconds:.2f} s)"
             )
 
         return result
diff --git a/tests/test_ci_insights.py b/tests/test_ci_insights.py
@@ -1,10 +1,15 @@
+import datetime
 import re
 import typing
 
+import _pytest.nodes
+import _pytest.pytester
+import _pytest.reports
 import pytest
 import responses
 from opentelemetry.sdk import trace
 
+import pytest_mergify
 from pytest_mergify import ci_insights, flaky_detection
 
 from . import conftest
@@ -164,10 +169,10 @@ def test_corge():
 
     assert re.search(
         r"""🐛 Flaky detection
-- Skipped 1 test\(s\):
+- Skipped 1 test:
     • 'test_flaky_detection\.py::test_quux_[a]+' has not been tested multiple times because the name of the test exceeds our limit of \d+ characters
 - Used [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)
-- Active for 3 new test\(s\):
+- Active for 3 new tests:
     • 'test_flaky_detection\.py::test_bar' has been tested \d+ times using approx\. [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)
     • 'test_flaky_detection\.py::test_baz' has been tested \d+ times using approx\. [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)
     • 'test_flaky_detection\.py::test_corge' has been tested \d+ times using approx\. [0-9.]+ % of the budget \([0-9.]+ s/[0-9.]+ s\)""",
@@ -264,6 +269,130 @@ async def test_bar():
     }
 
 
+@responses.activate
+def test_flaky_detection_slow_test_not_retried(
+    monkeypatch: pytest.MonkeyPatch,
+    pytester: _pytest.pytester.Pytester,
+) -> None:
+    """
+    Test that a slow new test is reported as too slow, instead of being retried,
+    when it can't reach `flaky_detection._MIN_TEST_RETRY_COUNT` within the budget.
+    """
+    _set_test_environment(monkeypatch)
+    _make_quarantine_mock()
+    _make_test_names_mock(
+        [
+            "test_flaky_detection_slow_test_not_retried.py::test_existing",
+        ]
+    )
+
+    class CustomPlugin:  # Fakes call durations: 10 s for `test_slow`, 1 ms for every other test.
+        def pytest_runtest_makereport(
+            self,
+            item: _pytest.nodes.Item,
+            call: _pytest.reports.TestReport,  # NOTE(review): pytest documents this hook's `call` as a `CallInfo` — confirm the annotation.
+        ) -> None:
+            if call.when != "call":  # Leave the other phases untouched.
+                return
+
+            if "test_slow" in item.nodeid:
+                call.duration = 10.0  # Simulate a slow test.
+            else:
+                call.duration = 0.001
+
+    pytester.makepyfile(
+        """
+        def test_existing():
+            assert True
+
+        def test_fast():
+            assert True
+
+        def test_slow():
+            assert True
+        """
+    )
+
+    result = pytester.runpytest_inprocess(
+        plugins=[CustomPlugin(), pytest_mergify.PytestMergify()]
+    )
+    result.assert_outcomes(passed=1003)  # NOTE(review): presumably 1 run each of `test_existing` and `test_slow`, plus 1 + max retries of `test_fast` — confirm.
+
+    # `test_fast` should have been retried and reported with its budget usage.
+    assert re.search(
+        r"'test_flaky_detection_slow_test_not_retried\.py::test_fast' has been tested \d+ times",
+        result.stdout.str(),
+    )
+
+    assert (
+        f"'test_flaky_detection_slow_test_not_retried.py::test_slow' is too slow to be tested at least {flaky_detection._MIN_TEST_RETRY_COUNT} times within the budget"
+        in result.stdout.str()
+    )
+
+
+@responses.activate
+def test_flaky_detection_budget_deadline_stops_retries(
+    monkeypatch: pytest.MonkeyPatch,
+    pytester: _pytest.pytester.Pytester,
+) -> None:
+    """
+    Test that retries stop once the budget deadline has passed and that the report mentions the shortfall.
+    """
+    _set_test_environment(monkeypatch)
+    _make_quarantine_mock()
+    _make_test_names_mock(
+        [
+            "test_flaky_detection_budget_deadline_stops_retries.py::test_existing",
+        ]
+    )
+
+    class CustomPlugin:  # Moves the detector's deadline into the past once it has been set.
+        deadline_patched: bool = False  # Guards against overriding the deadline more than once.
+
+        def pytest_runtest_protocol(self, item: _pytest.nodes.Item) -> None:
+            plugin = None
+            for existing in item.session.config.pluginmanager.get_plugins():  # Look up the registered Mergify plugin instance.
+                if isinstance(existing, pytest_mergify.PytestMergify):
+                    plugin = existing
+
+            if not plugin or not plugin.mergify_ci.flaky_detector:
+                return
+
+            # A deadline being set means flaky detection has started.
+            if plugin.mergify_ci.flaky_detector._deadline and not self.deadline_patched:
+                # Set the deadline in the past to stop immediately.
+                plugin.mergify_ci.flaky_detector._deadline = datetime.datetime.now(
+                    datetime.timezone.utc
+                ) - datetime.timedelta(hours=1)
+
+                self.deadline_patched = True
+
+    pytester.makepyfile(
+        """
+        def test_existing():
+            assert True
+
+        def test_new():
+            assert True
+        """
+    )
+
+    result = pytester.runpytest_inprocess(
+        plugins=[pytest_mergify.PytestMergify(), CustomPlugin()]
+    )
+
+    # We should have:
+    # - 1 execution of `test_existing`,
+    # - 1 initial execution of `test_new`,
+    # - Only 1 retry of `test_new` before the deadline is reached.
+    result.assert_outcomes(passed=3)
+
+    assert re.search(
+        r"'test_flaky_detection_budget_deadline_stops_retries\.py::test_new' has been tested only \d+ times instead of \d+ times to avoid exceeding the budget",
+        result.stdout.str(),
+    )
+
+
 def _get_span_counts(
     spans: typing.Dict[str, trace.ReadableSpan],
 ) -> typing.Dict[str, int]: