fix(ci_visibility): count failed/skipped tests in JUnit XML when retries are enabled [backport 3.2] (#12899)

vitor-de-araujo · gnufede · web-flow · commit 77865e61934d · 2025-04-01T09:13:57.000Z
Backport 2a506fb from #12862 to 3.2. The pytest JUnit XML plugin uses the test report's [`failed`](https://github.com/pytest-dev/pytest/blob/8.3.x/src/_pytest/junitxml.py#L562) and [`longrepr`](https://github.com/pytest-dev/pytest/blob/8.3.x/src/_pytest/junitxml.py#L201) properties to count failed tests and include them in the output. Because retried tests have their own special statuses (`dd_efd_final_failed`, etc), they don't count as failures, and are excluded from the JUnit XML count. This PR creates a subclass of TestReport that is aware of those special statuses and reports them as passed/failed/skipped accordingly. This is honestly a bit of a hack. It would probably be best to rewrite the retry logic entirely so it would use normal pytest states, and pass the information that they are retries in some other way. But that will take more time, and I would like to fix the bug sooner rather than later. The exception information for the initial attempt is included in the JUnit XML. Known issue: quarantined failing tests are not counted. The way forward with this is to rewrite the retry logic, which I plan to do in a future PR. 
- [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Co-authored-by: Federico Mon <federico.mon@datadoghq.com> (cherry picked from commit 2a506fb)
diff --git a/ddtrace/contrib/internal/pytest/_atr_utils.py b/ddtrace/contrib/internal/pytest/_atr_utils.py
@@ -4,6 +4,7 @@
 import pytest
 
 from ddtrace.contrib.internal.pytest._retry_utils import RetryOutcomes
+from ddtrace.contrib.internal.pytest._retry_utils import RetryTestReport
 from ddtrace.contrib.internal.pytest._retry_utils import _get_outcome_from_retry
 from ddtrace.contrib.internal.pytest._retry_utils import _get_retry_attempt_string
 from ddtrace.contrib.internal.pytest._retry_utils import set_retry_num
@@ -79,13 +80,14 @@ def atr_handle_retries(
         return
 
     atr_outcome = _atr_do_retries(item, outcomes)
+    longrepr = InternalTest.stash_get(test_id, "failure_longrepr")
 
-    final_report = pytest_TestReport(
+    final_report = RetryTestReport(
         nodeid=item.nodeid,
         location=item.location,
         keywords=item.keywords,
         when="call",
-        longrepr=None,
+        longrepr=longrepr,
         outcome=final_outcomes[atr_outcome],
     )
     item.ihook.pytest_runtest_logreport(report=final_report)
diff --git a/ddtrace/contrib/internal/pytest/_efd_utils.py b/ddtrace/contrib/internal/pytest/_efd_utils.py
@@ -4,6 +4,7 @@
 import pytest
 
 from ddtrace.contrib.internal.pytest._retry_utils import RetryOutcomes
+from ddtrace.contrib.internal.pytest._retry_utils import RetryTestReport
 from ddtrace.contrib.internal.pytest._retry_utils import _get_outcome_from_retry
 from ddtrace.contrib.internal.pytest._retry_utils import _get_retry_attempt_string
 from ddtrace.contrib.internal.pytest._retry_utils import set_retry_num
@@ -85,13 +86,14 @@ def efd_handle_retries(
             InternalTest.mark_skip(test_id)
 
     efd_outcome = _efd_do_retries(item)
+    longrepr = InternalTest.stash_get(test_id, "failure_longrepr")
 
-    final_report = pytest_TestReport(
+    final_report = RetryTestReport(
         nodeid=item.nodeid,
         location=item.location,
         keywords=item.keywords,
         when="call",
-        longrepr=None,
+        longrepr=longrepr,
         outcome=_FINAL_OUTCOMES[efd_outcome],
     )
     item.ihook.pytest_runtest_logreport(report=final_report)
diff --git a/ddtrace/contrib/internal/pytest/_plugin_v2.py b/ddtrace/contrib/internal/pytest/_plugin_v2.py
@@ -507,6 +507,9 @@ def _pytest_runtest_makereport(item: pytest.Item, call: pytest_CallInfo, outcome
         # (see <https://github.com/pytest-dev/pytest/blob/8.3.x/src/_pytest/main.py#L654>).
         original_result.outcome = OUTCOME_QUARANTINED
 
+    if original_result.failed or original_result.skipped:
+        InternalTest.stash_set(test_id, "failure_longrepr", original_result.longrepr)
+
     # ATR and EFD retry tests only if their teardown succeeded to ensure the best chance the retry will succeed
     # NOTE: this mutates the original result's outcome
     if InternalTest.stash_get(test_id, "setup_failed") or InternalTest.stash_get(test_id, "teardown_failed"):
diff --git a/ddtrace/contrib/internal/pytest/_retry_utils.py b/ddtrace/contrib/internal/pytest/_retry_utils.py
@@ -8,6 +8,7 @@
 from _pytest.runner import CallInfo
 import pytest
 
+from ddtrace.contrib.internal.pytest._types import pytest_TestReport
 from ddtrace.contrib.internal.pytest._types import tmppath_result_key
 from ddtrace.contrib.internal.pytest._utils import _TestOutcome
 from ddtrace.ext.test_visibility.api import TestExcInfo
@@ -128,3 +129,33 @@ def _retry_run_when(item, when, outcomes: RetryOutcomes) -> t.Tuple[CallInfo, _p
     if when == "call" or "passed" not in report.outcome:
         item.ihook.pytest_runtest_logreport(report=report)
     return call, report
+
+
+class RetryTestReport(pytest_TestReport):
+    """
+    A RetryTestReport behaves just like a normal pytest TestReport, except that the failed/passed/skipped
+    properties are aware of retry final states (dd_efd_final_*, etc). This affects the test counts in JUnit XML output,
+    for instance.
+
+    The object should be initialized with the `longrepr` of the _initial_ test attempt. A `longrepr` set to `None` means
+    the initial attempt either succeeded (which means it was already counted by pytest) or was quarantined (which means
+    we should not count it at all), so we don't need to count it here.
+    """
+
+    @property
+    def failed(self):
+        if self.longrepr is None:
+            return False
+        return "final_failed" in self.outcome
+
+    @property
+    def passed(self):
+        if self.longrepr is None:
+            return False
+        return "final_passed" in self.outcome or "final_flaky" in self.outcome
+
+    @property
+    def skipped(self):
+        if self.longrepr is None:
+            return False
+        return "final_skipped" in self.outcome
diff --git a/releasenotes/notes/ci_visibility-fix-junit-xml-retry-count-65de6ad6b9bb35d2.yaml b/releasenotes/notes/ci_visibility-fix-junit-xml-retry-count-65de6ad6b9bb35d2.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    CI Visibility: This fix resolves an issue where JUnit XML output would not count tests retried by Early Flake
+    Detection, Auto Test Retries, and Attempt-to-Fix.
diff --git a/tests/contrib/pytest/test_pytest_atr.py b/tests/contrib/pytest/test_pytest_atr.py
@@ -7,6 +7,7 @@
 - The session object is patched to never be a faulty session, by default.
 """
 from unittest import mock
+from xml.etree import ElementTree
 
 import pytest
 
@@ -315,3 +316,22 @@ def test_pytest_atr_does_not_retry_failed_setup_or_teardown(self):
 
         assert rec.ret == 1
         assert len(spans) == 5
+
+    def test_pytest_atr_junit_xml(self):
+        self.testdir.makepyfile(test_pass=_TEST_PASS_CONTENT)
+        self.testdir.makepyfile(test_fail=_TEST_FAIL_CONTENT)
+        self.testdir.makepyfile(test_errors=_TEST_ERRORS_CONTENT)
+        self.testdir.makepyfile(test_pass_on_retries=_TEST_PASS_ON_RETRIES_CONTENT)
+        self.testdir.makepyfile(test_skip=_TEST_SKIP_CONTENT)
+
+        rec = self.inline_run("--ddtrace", "--junit-xml=out.xml")
+        assert rec.ret == 1
+
+        test_suite = ElementTree.parse(f"{self.testdir}/out.xml").find("testsuite")
+
+        # There are 15 tests, but we get 16 in the JUnit XML output, because a test that passes during call but fails
+        # during teardown is counted twice. This is a bug in pytest, not ddtrace.
+        assert test_suite.attrib["tests"] == "16"
+        assert test_suite.attrib["failures"] == "4"
+        assert test_suite.attrib["skipped"] == "4"
+        assert test_suite.attrib["errors"] == "2"
diff --git a/tests/contrib/pytest/test_pytest_efd.py b/tests/contrib/pytest/test_pytest_efd.py
@@ -7,6 +7,7 @@
 - The session object is patched to never be a faulty session, by default.
 """
 from unittest import mock
+from xml.etree import ElementTree
 
 import pytest
 
@@ -285,3 +286,17 @@ def test_pytest_efd_does_not_retry_failed_teardown(self):
         assert fails_teardown_spans[0].get_tag("test.is_retry") != "true"
         assert rec.ret == 1
         assert len(spans) == 7
+
+    def test_pytest_efd_junit_xml(self):
+        self.testdir.makepyfile(test_known_pass=_TEST_KNOWN_PASS_CONTENT)
+        self.testdir.makepyfile(test_known_fail=_TEST_KNOWN_FAIL_CONTENT)
+        self.testdir.makepyfile(test_new_pass=_TEST_NEW_PASS_CONTENT)
+        self.testdir.makepyfile(test_new_fail=_TEST_NEW_FAIL_CONTENT)
+        self.testdir.makepyfile(test_new_flaky=_TEST_NEW_FLAKY_CONTENT)
+
+        rec = self.inline_run("--ddtrace", "--junit-xml=out.xml")
+        assert rec.ret == 1
+
+        test_suite = ElementTree.parse(f"{self.testdir}/out.xml").find("testsuite")
+        assert test_suite.attrib["tests"] == "7"
+        assert test_suite.attrib["failures"] == "3"

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +fixes:
 +  - |
 +    CI Visibility: This fix resolves an issue where JUnit XML output would not count tests retried by Early Flake
 +    Detection, Auto Test Retries, and Attempt-to-Fix.