[Backend Tester] Add pass rate breakdown by parameterization to markdown summary (pytorch#14360)

GregoryComer · StrycekSimon · commit c695c9cc186d · 2025-09-23T19:07:56.000+02:00
Add a table showing pass rate by test parameters. This gives a breakdown
by dtype and dynamic shape on/off for model tests, making it easier to
see the pass rate for f32 + static shapes.

Also, run the backend test workflows on pushes to release branches.
diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml
@@ -4,6 +4,8 @@ on:
   schedule:
     - cron: 0 2 * * *
   push:
+    branches:
+      - release/*
     tags:
       - ciflow/nightly/*
   pull_request:
diff --git a/.github/workflows/test-backend-coreml.yml b/.github/workflows/test-backend-coreml.yml
@@ -4,6 +4,8 @@ on:
   schedule:
     - cron: 0 2 * * *
   push:
+    branches:
+      - release/*
     tags:
       - ciflow/nightly/*
   pull_request:
diff --git a/.github/workflows/test-backend-qnn.yml b/.github/workflows/test-backend-qnn.yml
@@ -4,6 +4,8 @@ on:
   schedule:
     - cron: 0 2 * * *
   push:
+    branches:
+      - release/*
     tags:
       - ciflow/nightly/*
   pull_request:
diff --git a/.github/workflows/test-backend-vulkan.yml b/.github/workflows/test-backend-vulkan.yml
@@ -4,6 +4,8 @@ on:
   schedule:
     - cron: 0 2 * * *
   push:
+    branches:
+      - release/*
     tags:
       - ciflow/nightly/*
   pull_request:
diff --git a/.github/workflows/test-backend-xnnpack.yml b/.github/workflows/test-backend-xnnpack.yml
@@ -4,6 +4,8 @@ on:
   schedule:
     - cron: 0 2 * * *
   push:
+    branches:
+      - release/*
     tags:
       - ciflow/nightly/*
   pull_request:
diff --git a/backends/test/suite/generate_markdown_summary.py b/backends/test/suite/generate_markdown_summary.py
@@ -1,124 +1,229 @@
 import argparse
 import csv
+import json
 import sys
 
-#
-# A standalone script to generate a Markdown representation of a test report.
-# This is primarily intended to be used with GitHub actions to generate a nice
-# representation of the test results when looking at the action run.
-#
-# Usage: python executorch/backends/test/suite/generate_markdown_summary.py <path to test report CSV file>
-# Markdown is written to stdout.
-#
+from dataclasses import dataclass, field
 
 
-def escape_for_markdown(text: str) -> str:
+@dataclass
+class ResultCounts:
     """
-    Modify a string to properly display in a markdown table cell.
+    Represents aggregated result counts for each status.
     """
-    if not text:
-        return text
 
-    # Replace newlines with <br /> tags
-    escaped = text.replace("\n", "<br />")
+    total: int = 0
+    passes: int = 0
+    fails: int = 0
+    skips: int = 0
+    by_detail: dict[str, int] = field(default_factory=lambda: {})
 
-    # Escape backslashes.
-    escaped = escaped.replace("\\", "\\\\")
+    def add_row(self, result_value: str, result_detail: str) -> None:
+        """
+        Update the result counts for the specified row.
+        """
 
-    # Escape pipe characters that would break table structure
-    escaped = escaped.replace("|", "\\|")
+        self.total += 1
 
-    return escaped
+        if result_value == "Pass":
+            self.passes += 1
+        elif result_value == "Fail":
+            self.fails += 1
+        elif result_value == "Skip":
+            self.skips += 1
+        else:
+            raise RuntimeError(f"Unknown result value {result_value}")
 
+        if result_detail:
+            if result_detail not in self.by_detail:
+                self.by_detail[result_detail] = 0
+
+            self.by_detail[result_detail] += 1
+
+
+@dataclass
+class AggregatedSummary:
+    """
+    Represents aggregated summary data for the test run.
+    """
+
+    counts: ResultCounts  # Totals over every data row in the report.
+    counts_by_params: dict[str, ResultCounts]  # Keyed by each non-empty Params cell.
+    failed_tests: list[list[str]]  # Markdown-escaped rows whose result is Fail.
+    header: list[str]  # First row of the CSV.
+
+
+#
+# A standalone script to generate a Markdown representation of a test report.
+# This is primarily intended to be used with GitHub actions to generate a nice
+# representation of the test results when looking at the action run.
+#
+# Usage: python executorch/backends/test/suite/generate_markdown_summary.py <path to test report CSV file>
+# Markdown is written to stdout.
+#
 
-def generate_markdown(csv_path: str, exit_code: int = 0):  # noqa (C901)
-    # Print warning if exit code is non-zero
-    if exit_code != 0:
-        print("> [!WARNING]")
-        print(
-            f"> Exit code {exit_code} was non-zero. Test process may have crashed. Check the job logs for more information.\n"
-        )
 
+def aggregate_results(csv_path: str) -> AggregatedSummary:
     with open(csv_path, newline="", encoding="utf-8") as f:
         reader = csv.reader(f)
         rows = list(reader)
 
     header = rows[0]
     data_rows = rows[1:]
 
-    # Find the Result and Result Detail column indices
-    result_column_index = None
-    result_detail_column_index = None
-    for i, col in enumerate(header):
-        if col.lower() == "result":
-            result_column_index = i
-        elif col.lower() == "result detail":
-            result_detail_column_index = i
+    header_indices_by_name = {n.lower(): i for (i, n) in enumerate(header)}
+    params_column_index = header_indices_by_name.get("params", None)
+    result_column_index = header_indices_by_name["result"]
+    result_detail_column_index = header_indices_by_name["result detail"]
 
     # Count results and prepare data
-    pass_count = 0
-    fail_count = 0
-    skip_count = 0
+    counts = ResultCounts()
     failed_tests = []
-    processed_rows = []
-    result_detail_counts = {}
+    counts_by_param = {}
 
     for row in data_rows:
+        result = row[result_column_index]
+        result_detail = row[result_detail_column_index]
+        counts.add_row(result, result_detail)
+
+        # Test against None rather than truthiness: column 0 is a valid index.
+        params = row[params_column_index] if params_column_index is not None else None
+        if params:
+            if params not in counts_by_param:
+                counts_by_param[params] = ResultCounts()
+            counts_by_param[params].add_row(result, result_detail)
+
         # Make a copy of the row to avoid modifying the original
         processed_row = [escape_for_markdown(cell) for cell in row]
 
         # Count results and collect failed tests
         if result_column_index is not None and result_column_index < len(row):
             result_value = row[result_column_index].strip().lower()
             if result_value == "pass":
-                pass_count += 1
                 processed_row[result_column_index] = (
                     '<span style="color:green">Pass</span>'
                 )
             elif result_value == "fail":
-                fail_count += 1
                 processed_row[result_column_index] = (
                     '<span style="color:red">Fail</span>'
                 )
                 failed_tests.append(processed_row.copy())
             elif result_value == "skip":
-                skip_count += 1
                 processed_row[result_column_index] = (
                     '<span style="color:gray">Skip</span>'
                 )
 
-        # Count result details (excluding empty ones)
-        if result_detail_column_index is not None and result_detail_column_index < len(
-            row
-        ):
-            result_detail_value = row[result_detail_column_index].strip()
-            if result_detail_value:  # Only count non-empty result details
-                if result_detail_value in result_detail_counts:
-                    result_detail_counts[result_detail_value] += 1
-                else:
-                    result_detail_counts[result_detail_value] = 1
+    return AggregatedSummary(
+        counts=counts,
+        failed_tests=failed_tests,
+        counts_by_params=counts_by_param,
+        header=header,
+    )
+
+
+def escape_for_markdown(text: str) -> str:
+    """
+    Modify a string to properly display in a markdown table cell.
+    """
+    if not text:
+        return text
+
+    # Replace newlines with <br /> tags
+    escaped = text.replace("\n", "<br />")
 
-        processed_rows.append(processed_row)
+    # Escape backslashes.
+    escaped = escaped.replace("\\", "\\\\")
+
+    # Escape pipe characters that would break table structure
+    escaped = escaped.replace("|", "\\|")
+
+    return escaped
+
+
+def generate_markdown(csv_path: str, exit_code: int = 0):  # noqa (C901)
+    # Print warning if exit code is non-zero
+    if exit_code != 0:
+        print("> [!WARNING]")
+        print(
+            f"> Exit code {exit_code} was non-zero. Test process may have crashed. Check the job logs for more information.\n"
+        )
+
+    results = aggregate_results(csv_path)
 
     # Generate Summary section
-    total_rows = len(data_rows)
     print("# Summary\n")
-    print(f"- **Pass**: {pass_count}/{total_rows}")
-    print(f"- **Fail**: {fail_count}/{total_rows}")
-    print(f"- **Skip**: {skip_count}/{total_rows}")
+    # Skipped tests are left out of the pass/fail denominator.
+    passes, fails = results.counts.passes, results.counts.fails
+    total_excluding_skips = passes + fails
+
+    # max() keeps an empty or all-skipped report from dividing by zero.
+    pass_fraction = passes / max(total_excluding_skips, 1)
+    fail_fraction = fails / max(total_excluding_skips, 1)
+    print(f"- **Pass**: {passes}/{total_excluding_skips} ({pass_fraction*100:.2f}%)")
+    print(f"- **Fail**: {fails}/{total_excluding_skips} ({fail_fraction*100:.2f}%)")
+    print(f"- **Skip**: {results.counts.skips}")
+
+    if results.counts_by_params:
+        print("\n## Results by Parameters\n")
+
+        # Extract all unique parameter keys from the JSON strings
+        all_param_keys = set()
+        parsed_params = {}
+
+        for params_str in results.counts_by_params.keys():
+            # Parse the JSON string (it's a string representation of a dict)
+            params_dict = json.loads(params_str)
+            parsed_params[params_str] = params_dict
+            all_param_keys.update(params_dict.keys())
+
+        if parsed_params and len(parsed_params) > 1:
+            # Sort parameter keys for consistent column ordering
+            sorted_param_keys = sorted(all_param_keys)
+
+            # Create table header
+            header_cols = sorted_param_keys + ["Pass", "Fail", "Skip", "Pass %"]
+            print("| " + " | ".join(header_cols) + " |")
+            print("|" + "|".join(["---"] * len(header_cols)) + "|")
+
+            # Create table rows
+            for params_str, counts in results.counts_by_params.items():
+                if params_str in parsed_params:
+                    params_dict = parsed_params[params_str]
+                    row_values = []
+
+                    # Add parameter values
+                    for key in sorted_param_keys:
+                        value = params_dict.get(key, "")
+                        row_values.append(str(value))
+                    # A group may hold only skips; max() avoids dividing by zero.
+                    denominator = max(counts.passes + counts.fails, 1)
+                    pass_fraction = counts.passes / denominator
+                    # Add count values
+                    row_values.extend(
+                        [
+                            str(counts.passes),
+                            str(counts.fails),
+                            str(counts.skips),
+                            f"{pass_fraction*100:.2f}%",
+                        ]
+                    )
+
+                    print("| " + " | ".join(row_values) + " |")
+
+        print()
 
     print("## Failure Breakdown:")
-    total_rows_with_result_detail = sum(result_detail_counts.values())
-    for detail, count in sorted(result_detail_counts.items()):
+    total_rows_with_result_detail = sum(results.counts.by_detail.values())
+    for detail, count in sorted(results.counts.by_detail.items()):
         print(f"- **{detail}**: {count}/{total_rows_with_result_detail}")
 
     # Generate Failed Tests section
     print("# Failed Tests\n")
-    if failed_tests:
-        escaped_header = [escape_for_markdown(col) for col in header]
+    if results.failed_tests:
+        escaped_header = [escape_for_markdown(col) for col in results.header]
         print("| " + " | ".join(escaped_header) + " |")
-        print("|" + "|".join(["---"] * len(header)) + "|")
-        for row in failed_tests:
+        print("|" + "|".join(["---"] * len(results.header)) + "|")
+        for row in results.failed_tests:
             print("| " + " | ".join(row) + " |")
     else:
         print("No failed tests.\n")
diff --git a/backends/test/suite/reporting.py b/backends/test/suite/reporting.py
@@ -1,4 +1,5 @@
 import csv
+import json
 
 from collections import Counter
 from dataclasses import dataclass, field
@@ -343,7 +344,9 @@ def _sum_op_counts(counter: Counter | None) -> int | None:
 
 def _serialize_params(params: dict[str, Any] | None) -> str:
     if params is not None:
-        return str(dict(sorted(params.items())))
+        # Stringify values (JSON can't encode dtypes); sort keys so that equal
+        # parameter sets always serialize to the same Params cell.
+        return json.dumps({k: str(v) for k, v in params.items()}, sort_keys=True)
     else:
         return ""
 
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
@@ -57,7 +57,7 @@ def _graph_has_unsupported_patterns(program: torch.export.ExportedProgram) -> bo
             and node.target == exir_ops.edge.aten.convolution.default
         ):
             in_rank = node.args[0].meta["val"].dim()
-            if in_rank != 4:
+            if in_rank > 4:
                 return True
 
     return False
diff --git a/backends/test/suite/tests/test_reporting.py b/backends/test/suite/tests/test_reporting.py
@@ -1,3 +1,4 @@
+import json
 import unittest
 
 from csv import DictReader
@@ -102,14 +103,16 @@ def test_csv_report_simple(self):
         self.assertEqual(records[2]["Test Case"], "test2")
         self.assertEqual(records[2]["Flow"], "flow1")
         self.assertEqual(records[2]["Result"], "Pass")
-        self.assertEqual(records[2]["Params"], str({"dtype": torch.float32}))
+        self.assertEqual(records[2]["Params"], json.dumps({"dtype": "torch.float32"}))
 
         # Validate fourth record: test2, backend2, EXPORT_FAIL with use_dynamic_shapes param
         self.assertEqual(records[3]["Test ID"], "test2_backend2_flow1")
         self.assertEqual(records[3]["Test Case"], "test2")
         self.assertEqual(records[3]["Flow"], "flow1")
         self.assertEqual(records[3]["Result"], "Skip")
-        self.assertEqual(records[3]["Params"], str({"use_dynamic_shapes": True}))
+        self.assertEqual(
+            records[3]["Params"], json.dumps({"use_dynamic_shapes": "True"})
+        )
 
     def test_count_ops(self):
         """