[Backend Tester] Add subtest index field

GregoryComer · GregoryComer · commit f67c9ec40874 · 2025-08-11T22:27:54.000-07:00
ghstack-source-id: ed7cbf3 ghstack-comment-id: 3177697272 Pull-Request: #13311
diff --git a/backends/test/suite/context.py b/backends/test/suite/context.py
@@ -1,13 +1,16 @@
 # Test run context management. This is used to determine the test context for reporting
 # purposes.
 class TestContext:
+    subtest_index: int
+
     def __init__(
         self, test_name: str, test_base_name: str, flow_name: str, params: dict | None
     ):
         self.test_name = test_name
         self.test_base_name = test_base_name
         self.flow_name = flow_name
         self.params = params
+        self.subtest_index = 0
 
     def __enter__(self):
         global _active_test_context
diff --git a/backends/test/suite/operators/__init__.py b/backends/test/suite/operators/__init__.py
@@ -152,12 +152,16 @@ def _test_op(
             flow,
             context.test_name,
             context.test_base_name,
+            context.subtest_index,
             context.params,
             generate_random_test_inputs=generate_random_test_inputs,
         )
 
         log_test_summary(run_summary)
 
+        # Advance to the next subtest. A new context (starting at index 0) is created for each test.
+        context.subtest_index = context.subtest_index + 1
+
         if not run_summary.result.is_success():
             if run_summary.result.is_backend_failure():
                 raise RuntimeError("Test failure.") from run_summary.error
diff --git a/backends/test/suite/reporting.py b/backends/test/suite/reporting.py
@@ -11,6 +11,41 @@
 from torch.export import ExportedProgram
 
 
+# The maximum number of model output tensors to log statistics for. Most model tests have
+# only one output, but some return several. The limit must be fixed up front because the CSV
+# columns are declared once and rows are written progressively; extra outputs get no stats.
+MAX_LOGGED_MODEL_OUTPUTS = 2
+
+
+# Field names for the CSV report.
+CSV_FIELD_NAMES = [
+    "Test ID",
+    "Test Case",
+    "Subtest",
+    "Flow",
+    "Params",
+    "Result",
+    "Result Detail",
+    "Delegated",
+    "Quantize Time (s)",
+    "Lower Time (s)",
+    "Delegated Nodes",
+    "Undelegated Nodes",
+    "Delegated Ops",
+    "Undelegated Ops",
+    "PTE Size (Kb)",
+]
+
+for i in range(MAX_LOGGED_MODEL_OUTPUTS):
+    CSV_FIELD_NAMES.extend(
+        [
+            f"Output {i} Error Max",
+            f"Output {i} Error MAE",
+            f"Output {i} SNR",
+        ]
+    )
+
+
 # Operators that are excluded from the counts returned by count_ops. These are used to
 # exclude operatations that are not logically relevant or delegatable to backends.
 OP_COUNT_IGNORED_OPS = {
@@ -129,6 +164,9 @@ class TestCaseSummary:
     name: str
     """ The full name of test, including flow and parameter suffixes. """
 
+    subtest_index: int
+    """ The subtest number. If a test case runs multiple tests, this field can be used to disambiguate. """
+
     params: dict | None
     """ Test-specific parameters, such as dtype. """
 
@@ -305,14 +343,22 @@ def generate_csv_report(summary: RunSummary, output: TextIO):
         "Lower Time (s)",
     ]
 
-    # Tests can have custom parameters. We'll want to report them here, so we need
-    # a list of all unique parameter names.
-    param_names = reduce(
-        lambda a, b: a.union(b),
-        (
-            set(s.params.keys())
-            for s in summary.test_case_summaries
-            if s.params is not None
+def write_csv_row(record: TestCaseSummary, output: TextIO):
+    writer = csv.DictWriter(output, CSV_FIELD_NAMES)
+
+    row = {
+        "Test ID": record.name,
+        "Test Case": record.base_name,
+        "Subtest": record.subtest_index,
+        "Flow": record.flow,
+        "Params": _serialize_params(record.params),
+        "Result": record.result.to_short_str(),
+        "Result Detail": record.result.to_detail_str(),
+        "Delegated": "True" if record.is_delegated() else "False",
+        "Quantize Time (s)": (
+            f"{record.quantize_time.total_seconds():.3f}"
+            if record.quantize_time
+            else None
         ),
         set(),
     )
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
@@ -46,6 +46,7 @@ def run_test(  # noqa: C901
     flow: TestFlow,
     test_name: str,
     test_base_name: str,
+    subtest_index: int,
     params: dict | None,
     dynamic_shapes: Any | None = None,
     generate_random_test_inputs: bool = True,
@@ -65,6 +66,7 @@ def build_result(
         return TestCaseSummary(
             backend=flow.backend,
             base_name=test_base_name,
+            subtest_index=subtest_index,
             flow=flow.name,
             name=test_name,
             params=params,