fix(ci-insights): Use all phases to measure duration of a test run (#276)

remyduthu · web-flow · commit 753ea41b2677 · 2025-11-19T09:29:05.000Z
The goal of this change is to measure the duration of a test run more
precisely by including all the phases (setup, call, teardown) instead
of only the call phase. With this improvement, the budget can be
computed more accurately.

Fixes: MRGFY-6111
diff --git a/pytest_mergify/flaky_detection.py b/pytest_mergify/flaky_detection.py
@@ -37,10 +37,27 @@ def min_budget_duration(self) -> datetime.timedelta:
 class _TestMetrics:
     "Represents metrics collected for a test."
 
-    initial_duration: datetime.timedelta = dataclasses.field(
+    initial_setup_duration: datetime.timedelta = dataclasses.field(
         default_factory=datetime.timedelta
     )
-    "Represents the duration of the initial execution of the test."
+    initial_call_duration: datetime.timedelta = dataclasses.field(
+        default_factory=datetime.timedelta
+    )
+    initial_teardown_duration: datetime.timedelta = dataclasses.field(
+        default_factory=datetime.timedelta
+    )
+
+    @property
+    def initial_duration(self) -> datetime.timedelta:
+        """
+        Represents the duration of the initial run of the test including the 3
+        phases of the protocol (setup, call, teardown).
+        """
+        return (
+            self.initial_setup_duration
+            + self.initial_call_duration
+            + self.initial_teardown_duration
+        )
 
     # NOTE(remyduthu): We need this flag because we may have processed a test
     # without scheduling reruns for it (e.g., because it was too slow).
@@ -59,11 +76,19 @@ class _TestMetrics:
     )
     "Represents the total duration spent executing this test, including reruns."
 
-    def add_duration(self, duration: datetime.timedelta) -> None:
-        if not self.initial_duration:
-            self.initial_duration = duration
+    def fill_from_report(self, report: _pytest.reports.TestReport) -> None:
+        duration = datetime.timedelta(seconds=report.duration)
+
+        if report.when == "setup" and not self.initial_setup_duration:
+            self.initial_setup_duration = duration
+        elif report.when == "call" and not self.initial_call_duration:
+            self.initial_call_duration = duration
+        elif report.when == "teardown" and not self.initial_teardown_duration:
+            self.initial_teardown_duration = duration
+
+        if report.when == "call":
+            self.rerun_count += 1
 
-        self.rerun_count += 1
         self.total_duration += duration
 
     def expected_duration(self) -> datetime.timedelta:
@@ -141,9 +166,6 @@ def _fetch_context(self) -> _FlakyDetectionContext:
         return result
 
     def detect_from_report(self, report: _pytest.reports.TestReport) -> bool:
-        if report.when != "call":
-            return False
-
         if report.outcome not in ["failed", "passed"]:
             return False
 
@@ -161,7 +183,7 @@ def detect_from_report(self, report: _pytest.reports.TestReport) -> bool:
             return False
 
         metrics = self._test_metrics.setdefault(test, _TestMetrics())
-        metrics.add_duration(datetime.timedelta(seconds=report.duration))
+        metrics.fill_from_report(report)
 
         return True
 
diff --git a/tests/test_flaky_detection.py b/tests/test_flaky_detection.py
@@ -1,6 +1,8 @@
 import datetime
 import typing
 
+import _pytest
+import _pytest.reports
 import freezegun
 
 from pytest_mergify import flaky_detection
@@ -22,6 +24,7 @@ def __init__(self) -> None:
         self.url = ""
         self.full_repository_name = ""
         self.mode = "new"
+        self._test_metrics = {}
 
     def __post_init__(self) -> None:
         pass
@@ -59,6 +62,38 @@ def test_flaky_detector_get_duration_before_deadline() -> None:
     assert detector._get_duration_before_deadline() == datetime.timedelta(seconds=10)
 
 
+def test_flaky_detector_detect_from_report() -> None:
+    def make_report(
+        nodeid: str, when: typing.Literal["setup", "call", "teardown"], duration: float
+    ) -> _pytest.reports.TestReport:
+        return _pytest.reports.TestReport(
+            duration=duration,
+            keywords={},
+            location=("", None, ""),
+            longrepr=None,
+            nodeid=nodeid,
+            outcome="passed",
+            when=when,
+        )
+
+    detector = InitializedFlakyDetector()
+    detector._context = _make_flaky_detection_context(max_test_name_length=100)
+
+    detector.detect_from_report(make_report(nodeid="foo", when="setup", duration=1))
+    detector.detect_from_report(make_report(nodeid="foo", when="call", duration=2))
+    detector.detect_from_report(make_report(nodeid="foo", when="teardown", duration=3))
+
+    detector.detect_from_report(make_report(nodeid="foo", when="setup", duration=4))
+    detector.detect_from_report(make_report(nodeid="foo", when="call", duration=5))
+    detector.detect_from_report(make_report(nodeid="foo", when="teardown", duration=6))
+
+    metrics = detector._test_metrics.get("foo")
+    assert metrics is not None
+    assert metrics.initial_duration == datetime.timedelta(seconds=6)
+    assert metrics.rerun_count == 2
+    assert metrics.total_duration == datetime.timedelta(seconds=21)
+
+
 def test_flaky_detector_count_remaining_tests() -> None:
     detector = InitializedFlakyDetector()
     detector._test_metrics = {
@@ -79,11 +114,11 @@ def test_flaky_detector_get_rerun_count_for_test() -> None:
     )
     detector._test_metrics = {
         "foo": flaky_detection._TestMetrics(
-            initial_duration=datetime.timedelta(milliseconds=10),
+            initial_call_duration=datetime.timedelta(milliseconds=10),
             is_processed=True,
         ),
         "bar": flaky_detection._TestMetrics(
-            initial_duration=datetime.timedelta(milliseconds=100),
+            initial_call_duration=datetime.timedelta(milliseconds=100),
         ),
         "baz": flaky_detection._TestMetrics(),
     }
@@ -103,11 +138,11 @@ def test_flaky_detector_get_rerun_count_for_test_with_slow_test() -> None:
     detector._test_metrics = {
         "foo": flaky_detection._TestMetrics(
             # Can't be reran 5 times within the budget.
-            initial_duration=datetime.timedelta(seconds=1),
+            initial_call_duration=datetime.timedelta(seconds=1),
         ),
         "bar": flaky_detection._TestMetrics(
             # This test should not be impacted by the previous one.
-            initial_duration=datetime.timedelta(milliseconds=1),
+            initial_call_duration=datetime.timedelta(milliseconds=1),
         ),
     }
     detector.set_deadline()
@@ -128,7 +163,7 @@ def test_flaky_detector_get_rerun_count_for_test_with_fast_test() -> None:
     detector._test_metrics = {
         "foo": flaky_detection._TestMetrics(
             # Should only be reran 1000 times, freeing the rest of the budget for other tests.
-            initial_duration=datetime.timedelta(milliseconds=1),
+            initial_call_duration=datetime.timedelta(milliseconds=1),
         ),
     }
     detector.set_deadline()
@@ -146,7 +181,7 @@ def test_flaky_detector_get_rerun_count_for_test_with_timeout() -> None:
     )
     detector._test_metrics = {
         "foo": flaky_detection._TestMetrics(
-            initial_duration=datetime.timedelta(milliseconds=4),
+            initial_call_duration=datetime.timedelta(milliseconds=4),
         ),
     }
     detector.set_deadline()