[Backend Tester] Migrate to pytest #14456
@@ -11,6 +11,7 @@
import os

import executorch.backends.test.suite.flow
import torch

from executorch.backends.test.suite.flow import TestFlow
from executorch.backends.test.suite.runner import runner_main

@@ -55,6 +56,11 @@ def get_test_flows() -> dict[str, TestFlow]:
    return _ALL_TEST_FLOWS


def dtype_to_str(dtype: torch.dtype) -> str:
    # Strip off "torch."
    return str(dtype)[6:]

Review comment on dtype_to_str: This utility function is used for generating the display name for parameterized tests.


def load_tests(loader, suite, pattern):
    package_dir = os.path.dirname(__file__)
    discovered_suite = loader.discover(
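For reference, the slicing in dtype_to_str relies on str() of a torch dtype carrying a "torch." prefix; a quick check in a Python session:

    >>> import torch
    >>> str(torch.float16)
    'torch.float16'
    >>> str(torch.float16)[6:]   # what dtype_to_str returns
    'float16'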
@@ -0,0 +1,182 @@
from typing import Any

import pytest
import torch

from executorch.backends.test.suite.flow import all_flows
from executorch.backends.test.suite.reporting import _sum_op_counts
from executorch.backends.test.suite.runner import run_test


def pytest_configure(config):
    backends = set()

    for flow in all_flows().values():
        config.addinivalue_line(
            "markers",
            f"flow_{flow.name}: mark a test as testing the {flow.name} flow",
        )

        if flow.backend not in backends:
            config.addinivalue_line(
                "markers",
                f"backend_{flow.backend}: mark a test as testing the {flow.backend} backend",
            )
            backends.add(flow.backend)

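Registering these markers lets individual flows and backends be selected from the command line with pytest's -m option. A hedged sketch, assuming a flow and backend both named "xnnpack" are returned by all_flows() (the concrete names are not shown in this diff):

    pytest -m flow_xnnpack      # run only tests parameterized for the xnnpack flow
    pytest -m backend_xnnpack   # run all flows belonging to the xnnpack backend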

class TestRunner:
    def __init__(self, flow, test_name, test_base_name):
        self._flow = flow
        self._test_name = test_name
        self._test_base_name = test_base_name
        self._subtest = 0
        self._results = []

    def lower_and_run_model(
        self,
        model: torch.nn.Module,
        inputs: Any,
        generate_random_test_inputs=True,
        dynamic_shapes=None,
    ):
        run_summary = run_test(
            model,
            inputs,
            self._flow,
            self._test_name,
            self._test_base_name,
            self._subtest,
            None,
            generate_random_test_inputs=generate_random_test_inputs,
            dynamic_shapes=dynamic_shapes,
        )

        self._subtest += 1
        self._results.append(run_summary)

        if not run_summary.result.is_success():
            if run_summary.result.is_backend_failure():
                raise RuntimeError("Test failure.") from run_summary.error
            else:
                # Non-backend failure indicates a bad test. Mark as skipped.
                pytest.skip(
                    f"Test failed for reasons other than backend failure. Error: {run_summary.error}"
                )


@pytest.fixture(
    params=[
        pytest.param(
            f,
            marks=[
                getattr(pytest.mark, f"flow_{f.name}"),
                getattr(pytest.mark, f"backend_{f.backend}"),
            ],
        )
        for f in all_flows().values()
    ],
    ids=str,
)
def test_runner(request):
    return TestRunner(request.param, request.node.name, request.node.originalname)

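For context, a minimal sketch of how a suite test might consume this fixture; the AddModule model and test body are hypothetical and not part of this PR:

    def test_add(test_runner):
        class AddModule(torch.nn.Module):
            def forward(self, x, y):
                return x + y

        # Lowers the model through the parameterized flow and runs it; backend
        # failures raise, while non-backend failures skip the test (see TestRunner).
        test_runner.lower_and_run_model(AddModule(), (torch.randn(4), torch.randn(4)))

Each call records a run summary on the runner instance, which the JSON metadata hook below reads back when building the report.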

@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
    # Store detailed results in the test report under the metadata key.
    metadata = {"subtests": []}

    if hasattr(item, "funcargs") and "test_runner" in item.funcargs:
        runner_instance = item.funcargs["test_runner"]

        for record in runner_instance._results:
            subtest_metadata = {}

            error_message = ""
            if record.error is not None:
                error_str = str(record.error)
                if len(error_str) > 400:
                    error_message = error_str[:200] + "..." + error_str[-200:]
                else:
                    error_message = error_str

            subtest_metadata["Test ID"] = record.name
            subtest_metadata["Test Case"] = record.base_name
            subtest_metadata["Subtest"] = record.subtest_index
            subtest_metadata["Flow"] = record.flow
            subtest_metadata["Result"] = record.result.to_short_str()
            subtest_metadata["Result Detail"] = record.result.to_detail_str()
            subtest_metadata["Error"] = error_message
            subtest_metadata["Delegated"] = "True" if record.is_delegated() else "False"
            subtest_metadata["Quantize Time (s)"] = (
                f"{record.quantize_time.total_seconds():.3f}"
                if record.quantize_time
                else None
            )
            subtest_metadata["Lower Time (s)"] = (
                f"{record.lower_time.total_seconds():.3f}"
                if record.lower_time
                else None
            )

            for output_idx, error_stats in enumerate(record.tensor_error_statistics):
                subtest_metadata[f"Output {output_idx} Error Max"] = (
                    f"{error_stats.error_max:.3f}"
                )
                subtest_metadata[f"Output {output_idx} Error MAE"] = (
                    f"{error_stats.error_mae:.3f}"
                )
                subtest_metadata[f"Output {output_idx} SNR"] = f"{error_stats.sqnr:.3f}"

            subtest_metadata["Delegated Nodes"] = _sum_op_counts(
                record.delegated_op_counts
            )
            subtest_metadata["Undelegated Nodes"] = _sum_op_counts(
                record.undelegated_op_counts
            )
            if record.delegated_op_counts:
                subtest_metadata["Delegated Ops"] = dict(record.delegated_op_counts)
            if record.undelegated_op_counts:
                subtest_metadata["Undelegated Ops"] = dict(record.undelegated_op_counts)
            subtest_metadata["PTE Size (Kb)"] = (
                f"{record.pte_size_bytes / 1000.0:.3f}" if record.pte_size_bytes else ""
            )

            metadata["subtests"].append(subtest_metadata)
    return metadata

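The pytest_json_* hooks above and below appear to target the pytest-json-report plugin; optionalhook=True keeps them from erroring when the plugin is not installed. A hedged usage sketch (the report file name here is arbitrary):

    pytest --json-report --json-report-file=backend_test_report.json

With that plugin active, the dictionary returned by pytest_json_runtest_metadata should show up under each test's "metadata" key in the generated JSON.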

@pytest.hookimpl(optionalhook=True)
def pytest_json_modifyreport(json_report):
    # Post-process the report, mainly to populate metadata for crashed tests. The runtest_metadata
    # hook doesn't seem to be called when there's a native crash, but xdist still creates a report
    # entry.

    for test_data in json_report["tests"]:
        if "metadata" not in test_data:
            test_data["metadata"] = {}
        metadata = test_data["metadata"]
        if "subtests" not in metadata:
            metadata["subtests"] = []
        subtests = metadata["subtests"]

        # Native crashes are recorded differently and won't have the full metadata.
        # Pytest-xdist records crash info under the "???" key.
        if "???" in test_data:
            test_id = test_data["nodeid"].removeprefix("::")  # Remove leading ::
            test_base_id = test_id.split("[")[
                0
            ]  # Strip parameterization to get the base test case
            params = test_id[len(test_base_id) + 1 : -1].split("-")
            flow = params[0]

            crashed_test_meta = {
                "Test ID": test_id,
                "Test Case": test_base_id,
                "Flow": flow,
                "Result": "Fail",
                "Result Detail": "Process Crash",
                "Error": test_data["???"].get("longrepr", "Process crashed."),
            }
            subtests.append(crashed_test_meta)
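To make the node-id parsing above concrete, a worked example with a hypothetical crashed test (the test and parameter names are made up for illustration):

    nodeid       = "::test_add[xnnpack-float32]"
    test_id      = "test_add[xnnpack-float32]"    # after removeprefix("::")
    test_base_id = "test_add"                     # everything before "["
    params       = ["xnnpack", "float32"]         # text inside the brackets, split on "-"
    flow         = "xnnpack"                      # first parameter is taken as the flow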
Review comment: +1