[Backend Tester] Add initial reporting skeleton

GregoryComer · GregoryComer · commit fbe335b76358 · 2025-07-14T11:48:40.000-07:00
diff --git a/backends/test/suite/__init__.py b/backends/test/suite/__init__.py
@@ -16,6 +16,9 @@
 
 import torch
 from executorch.backends.test.harness import Tester
+from executorch.backends.test.suite.context import get_active_test_context, TestContext
+from executorch.backends.test.suite.reporting import log_test_summary
+from executorch.backends.test.suite.runner import run_test, runner_main
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -60,17 +63,17 @@ def is_backend_enabled(backend):
 
 
 DTYPES = [
-    torch.int8,
-    torch.uint8,
-    torch.int16,
-    torch.uint16,
-    torch.int32,
-    torch.uint32,
-    torch.int64,
-    torch.uint64,
-    torch.float16,
+    # torch.int8,
+    # torch.uint8,
+    # torch.int16,
+    # torch.uint16,
+    # torch.int32,
+    # torch.uint32,
+    # torch.int64,
+    # torch.uint64,
+    # torch.float16,
     torch.float32,
-    torch.float64,
+    # torch.float64,
 ]
 
 FLOAT_DTYPES = [
@@ -117,16 +120,19 @@ def _expand_test(cls, test_name: str):
     delattr(cls, test_name)
 
 
-def _make_wrapped_test(test_func, *args, **kwargs):
+def _make_wrapped_test(
+    test_func: Callable,
+    test_name: str,
+    test_flow: str,
+    tester_factory: Callable,
+    params: dict | None = None,
+):
     def wrapped_test(self):
-        test_func(self, *args, **kwargs)
+        with TestContext(test_name, test_flow, params):
+            # Build a fresh dict: params is shared with TestContext and must not change.
+            test_kwargs = {**(params or {}), "tester_factory": tester_factory}
 
-    return wrapped_test
-
-
-def _make_wrapped_dtype_test(test_func, dtype, tester_factory):
-    def wrapped_test(self):
-        test_func(self, dtype, tester_factory)
+            test_func(self, **test_kwargs)
 
     return wrapped_test
 
@@ -140,37 +146,63 @@ def _create_test_for_backend(
     test_type = getattr(test_func, "test_type", TestType.STANDARD)
 
     if test_type == TestType.STANDARD:
-        wrapped_test = _make_wrapped_test(test_func, tester_factory)
+        wrapped_test = _make_wrapped_test(
+            test_func, test_func.__name__, flow_name, tester_factory
+        )
         test_name = f"{test_func.__name__}_{flow_name}"
         setattr(cls, test_name, wrapped_test)
     elif test_type == TestType.DTYPE:
         for dtype in DTYPES:
-            # wrapped_test = _make_wrapped_dtype_test(test_func, dtype, tester_factory)
-            wrapped_test = _make_wrapped_test(test_func, dtype, tester_factory)
+            wrapped_test = _make_wrapped_test(
+                test_func,
+                test_func.__name__,
+                flow_name,
+                tester_factory,
+                {"dtype": dtype},
+            )
             dtype_name = str(dtype)[6:]  # strip "torch."
             test_name = f"{test_func.__name__}_{dtype_name}_{flow_name}"
             setattr(cls, test_name, wrapped_test)
     else:
         raise NotImplementedError(f"Unknown test type {test_type}.")
 
 
+def load_tests(loader, suite, pattern):
+    """unittest `load_tests` hook: add tests discovered next to this file."""
+    here = os.path.dirname(__file__)
+    file_glob = pattern or "test_*.py"
+    found = loader.discover(start_dir=here, pattern=file_glob)
+    suite.addTests(found)
+    return suite
+
+
 class OperatorTest(unittest.TestCase):
     def _test_op(self, model, inputs, tester_factory):
-        tester = (
-            tester_factory(
-                model,
-                inputs,
-            )
-            .export()
-            .to_edge_transform_and_lower()
+        context = get_active_test_context()
+
+        # The wrapper built by _make_wrapped_test enters a TestContext before calling us.
+        assert context is not None, "Missing test context."
+
+        run_summary = run_test(
+            model,
+            inputs,
+            tester_factory,
+            context.test_name,
+            context.flow_name,
+            context.params,
         )
 
-        is_delegated = any(
-            n.target == torch._higher_order_ops.executorch_call_delegate
-            for n in tester.stages[tester.cur].graph_module.graph.nodes
-            if n.op == "call_function"
-        )
+        log_test_summary(run_summary)  # no-op unless a reporting session is active
+
+        if not run_summary.result.is_success():
+            if run_summary.result.is_backend_failure():
+                raise RuntimeError("Test failure.") from run_summary.error
+            else:
+                # Non-backend failure indicates a bad test. Mark as skipped.
+                raise unittest.SkipTest(
+                    f"Test failed for reasons other than backend failure. Error: {run_summary.error}"
+                )
+
 
-        # Only run the runtime test if the op was delegated.
-        if is_delegated:
-            (tester.to_executorch().serialize().run_method_and_compare_outputs())
+if __name__ == "__main__":
+    runner_main()
diff --git a/backends/test/suite/context.py b/backends/test/suite/context.py
@@ -0,0 +1,28 @@
+# Test run context management. This is used to determine the test context for reporting
+# purposes.
+class TestContext:
+    def __init__(self, test_name: str, flow_name: str, params: dict | None):
+        self.test_name = test_name
+        self.flow_name = flow_name
+        self.params = params
+
+    def __enter__(self):
+        global _active_test_context
+        import sys
+        if _active_test_context is not None:  # name the context that is still open
+            print(f"Active context: {_active_test_context.test_name}", file=sys.stderr)
+        assert _active_test_context is None  # contexts must not be nested
+        _active_test_context = self
+        return self  # so `with TestContext(...) as ctx:` binds this object
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        global _active_test_context
+        _active_test_context = None
+
+
+_active_test_context: TestContext | None = None
+
+
+def get_active_test_context() -> TestContext | None:
+    """Return the TestContext currently entered, or None if there is none."""
+    return _active_test_context
diff --git a/backends/test/suite/reporting.py b/backends/test/suite/reporting.py
@@ -0,0 +1,163 @@
+from collections import Counter
+from dataclasses import dataclass
+from enum import IntEnum, nonmember
+
+
+class TestResult(IntEnum):
+    """Represents the result of a test case run, indicating success or a specific failure reason."""
+
+    SUCCESS = 0
+    """ The test succeeded with the backend delegating part or all of the graph. """
+
+    SUCCESS_UNDELEGATED = 1
+    """ The test succeeded without the backend delegating anything. """
+
+    EAGER_FAIL = 2
+    """ The test failed due to the model failing to run in eager mode. """
+
+    EXPORT_FAIL = 3
+    """ The test failed due to the model failing to export. """
+
+    LOWER_FAIL = 4
+    """ The test failed due to a failure in partitioning or lowering. """
+
+    PTE_LOAD_FAIL = 5
+    """ The test failed due to the resulting PTE failing to load. """
+
+    PTE_RUN_FAIL = 6
+    """ The test failed due to the resulting PTE failing to run. """
+
+    OUTPUT_MISMATCH_FAIL = 7
+    """ The test failed due to a mismatch between runtime and reference outputs. """
+
+    UNKNOWN_FAIL = 8
+    """ The test failed in an unknown or unexpected manner. """
+
+    @nonmember
+    def is_success(self):
+        return self in {TestResult.SUCCESS, TestResult.SUCCESS_UNDELEGATED}
+
+    @nonmember
+    def is_non_backend_failure(self):
+        return self in {TestResult.EAGER_FAIL, TestResult.EXPORT_FAIL}
+
+    @nonmember
+    def is_backend_failure(self):
+        return not self.is_success() and not self.is_non_backend_failure()
+
+    @nonmember
+    def display_name(self):
+        if self == TestResult.SUCCESS:
+            return "Success (Delegated)"
+        elif self == TestResult.SUCCESS_UNDELEGATED:
+            return "Success (Undelegated)"
+        elif self == TestResult.EAGER_FAIL:
+            return "Fail (Eager)"
+        elif self == TestResult.EXPORT_FAIL:
+            return "Fail (Export)"
+        elif self == TestResult.LOWER_FAIL:
+            return "Fail (Lowering)"
+        elif self == TestResult.PTE_LOAD_FAIL:
+            return "Fail (PTE Load)"
+        elif self == TestResult.PTE_RUN_FAIL:
+            return "Fail (PTE Run)"
+        elif self == TestResult.OUTPUT_MISMATCH_FAIL:
+            return "Fail (Output Mismatch)"
+        elif self == TestResult.UNKNOWN_FAIL:
+            return "Fail (Other)"
+        else:
+            raise ValueError(f"Invalid TestResult value: {self}.")
+
+
+@dataclass
+class TestCaseSummary:
+    """
+    Summary of one executed test case: its name, flow, params, result, and error.
+    """
+
+    name: str
+    """ The qualified name of the test, not including the flow suffix. """
+
+    flow: str
+    """ The backend-specific flow name. Corresponds to flows registered in backends/test/suite/__init__.py. """
+
+    params: dict | None
+    """ Test-specific parameters, such as dtype. """
+
+    result: TestResult
+    """ The top-level result, such as SUCCESS or LOWER_FAIL. """
+
+    error: Exception | None
+    """ The Python exception object, if any. """
+
+
+class TestSessionState:
+    test_case_summaries: list[TestCaseSummary]  # one entry per logged test case
+
+    def __init__(self):
+        self.test_case_summaries = []  # appended to by log_test_summary()
+
+
+@dataclass
+class RunSummary:
+    """Aggregated pass/fail/skip totals for all test cases in a session."""
+
+    aggregated_results: dict[TestResult, int]
+    num_test_cases: int
+    test_case_summaries: list[TestCaseSummary]
+    total_failed: int
+    total_passed: int
+    total_skipped: int
+
+    @staticmethod
+    def from_session(session: TestSessionState) -> "RunSummary":
+        """
+        Tally the summaries recorded in `session` into a RunSummary.
+
+        Successful results count as passed, backend failures as failed, and
+        all remaining results as skipped.
+        """
+        cases = session.test_case_summaries
+
+        # Count of cases per result, ordered by TestResult value.
+        per_result = dict(sorted(Counter(c.result for c in cases).items()))
+
+        passed = sum(n for r, n in per_result.items() if r.is_success())
+        failed = sum(n for r, n in per_result.items() if r.is_backend_failure())
+        skipped = len(cases) - passed - failed
+
+        return RunSummary(
+            aggregated_results=per_result,
+            num_test_cases=len(cases),
+            test_case_summaries=cases,
+            total_failed=failed,
+            total_passed=passed,
+            total_skipped=skipped,
+        )
+
+
+_active_session: TestSessionState | None = None
+
+
+def begin_test_session():
+    """Open a new global test session; asserts that none is already active."""
+    global _active_session
+    assert _active_session is None, "A test session is already active."
+    _active_session = TestSessionState()
+
+
+def log_test_summary(summary: TestCaseSummary):
+    """Record `summary` in the active session; a no-op when none is active."""
+    session = _active_session
+    if session is not None:
+        session.test_case_summaries.append(summary)
+
+
+def complete_test_session() -> RunSummary:
+    """End the active session and return its aggregated RunSummary."""
+    global _active_session
+    finished = _active_session
+    assert finished is not None, "No test session is active."
+    run_summary = RunSummary.from_session(finished)
+    _active_session = None
+    return run_summary
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py