Skip to content

Commit 8e4b5ed

Browse files
committed
[Backend Tester] Pick test jobs + logic onto release/1.0
1 parent 7f201c2 commit 8e4b5ed

33 files changed

+337
-53
lines changed

backends/test/suite/flow.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import logging
22

3-
from dataclasses import dataclass
3+
from dataclasses import dataclass, field
44
from typing import Callable
55

66
from executorch.backends.test.harness import Tester
@@ -35,6 +35,15 @@ class TestFlow:
3535
is_delegated: bool = True
3636
""" Indicates whether the flow is expected to generate CALL_DELEGATE nodes. """
3737

38+
skip_patterns: list[str] = field(default_factory=lambda: [])
39+
""" Tests with names containing any substrings in this list are skipped. """
40+
41+
supports_serialize: bool = True
42+
""" True if the test flow supports the Serialize stage. """
43+
44+
def should_skip_test(self, test_name: str) -> bool:
45+
return any(pattern in test_name for pattern in self.skip_patterns)
46+
3847

3948
def all_flows() -> dict[str, TestFlow]:
4049
flows = []
@@ -109,4 +118,13 @@ def all_flows() -> dict[str, TestFlow]:
109118
except Exception as e:
110119
logger.info(f"Skipping QNN flow registration: {e}")
111120

121+
try:
122+
from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW
123+
124+
flows += [
125+
ARM_TOSA_FLOW,
126+
]
127+
except Exception as e:
128+
logger.info(f"Skipping ARM flow registration: {e}")
129+
112130
return {f.name: f for f in flows if f is not None}

backends/test/suite/flows/arm.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from executorch.backends.arm.test import common
2+
from executorch.backends.arm.test.tester.arm_tester import ArmTester
3+
from executorch.backends.test.suite.flow import TestFlow
4+
5+
6+
def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester:
7+
kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
8+
9+
return ArmTester(
10+
*args,
11+
**kwargs,
12+
)
13+
14+
15+
def _create_tosa_flow() -> TestFlow:
16+
return TestFlow(
17+
"arm_tosa",
18+
backend="arm",
19+
tester_factory=_create_arm_tester_tosa_fp,
20+
supports_serialize=False,
21+
)
22+
23+
24+
ARM_TOSA_FLOW = _create_tosa_flow()

backends/test/suite/flows/coreml.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def _create_coreml_flow(
1919
CoreMLTester, minimum_deployment_target=minimum_deployment_target
2020
),
2121
quantize=quantize,
22+
skip_patterns=["test_argmin", "test_argmax"],
2223
)
2324

2425

backends/test/suite/flows/vulkan.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def _create_vulkan_flow_base(
2020
tester_factory=VulkanTester,
2121
quantize=quantize_stage_factory is not None,
2222
quantize_stage_factory=quantize_stage_factory,
23+
skip_patterns=["float16", "float64"], # Not supported in swiftshader
2324
)
2425

2526

backends/test/suite/generate_markdown_summary.py

Lines changed: 171 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,58 @@
11
import argparse
22
import csv
3+
import json
34
import sys
45

6+
from dataclasses import dataclass, field
7+
8+
9+
@dataclass
10+
class ResultCounts:
11+
"""
12+
Represents aggregated result counts for each status.
13+
"""
14+
15+
total: int = 0
16+
passes: int = 0
17+
fails: int = 0
18+
skips: int = 0
19+
by_detail: dict[str, int] = field(default_factory=lambda: {})
20+
21+
def add_row(self, result_value: str, result_detail: str) -> None:
22+
"""
23+
Update the result counts for the specified row.
24+
"""
25+
26+
self.total += 1
27+
28+
if result_value == "Pass":
29+
self.passes += 1
30+
elif result_value == "Fail":
31+
self.fails += 1
32+
elif result_value == "Skip":
33+
self.skips += 1
34+
else:
35+
raise RuntimeError(f"Unknown result value {result_value}")
36+
37+
if result_detail:
38+
if result_detail not in self.by_detail:
39+
self.by_detail[result_detail] = 0
40+
41+
self.by_detail[result_detail] += 1
42+
43+
44+
@dataclass
45+
class AggregatedSummary:
46+
"""
47+
Represents aggregated summary data for the test run.
48+
"""
49+
50+
counts: ResultCounts
51+
counts_by_params: dict[str, ResultCounts]
52+
failed_tests: list[list[str]]
53+
header: list[str]
54+
55+
556
#
657
# A standalone script to generate a Markdown representation of a test report.
758
# This is primarily intended to be used with GitHub actions to generate a nice
@@ -12,93 +63,167 @@
1263
#
1364

1465

15-
def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901)
16-
# Print warning if exit code is non-zero
17-
if exit_code != 0:
18-
print("> [!WARNING]")
19-
print(
20-
f"> Exit code {exit_code} was non-zero. Test process may have crashed. Check the job logs for more information.\n"
21-
)
22-
66+
def aggregate_results(csv_path: str) -> AggregatedSummary:
2367
with open(csv_path, newline="", encoding="utf-8") as f:
2468
reader = csv.reader(f)
2569
rows = list(reader)
2670

2771
header = rows[0]
2872
data_rows = rows[1:]
2973

30-
# Find the Result and Result Detail column indices
31-
result_column_index = None
32-
result_detail_column_index = None
33-
for i, col in enumerate(header):
34-
if col.lower() == "result":
35-
result_column_index = i
36-
elif col.lower() == "result detail":
37-
result_detail_column_index = i
74+
header_indices_by_name = {n.lower(): i for (i, n) in enumerate(header)}
75+
params_column_index = header_indices_by_name.get("params", None)
76+
result_column_index = header_indices_by_name["result"]
77+
result_detail_column_index = header_indices_by_name["result detail"]
3878

3979
# Count results and prepare data
40-
pass_count = 0
41-
fail_count = 0
42-
skip_count = 0
80+
counts = ResultCounts()
4381
failed_tests = []
44-
processed_rows = []
45-
result_detail_counts = {}
82+
counts_by_param = {}
4683

4784
for row in data_rows:
85+
result = row[result_column_index]
86+
result_detail = row[result_detail_column_index]
87+
88+
counts.add_row(result, result_detail)
89+
90+
params = row[params_column_index] if params_column_index else None
91+
if params:
92+
if params not in counts_by_param:
93+
counts_by_param[params] = ResultCounts()
94+
counts_by_param[params].add_row(result, result_detail)
95+
4896
# Make a copy of the row to avoid modifying the original
49-
processed_row = row.copy()
97+
processed_row = [escape_for_markdown(cell) for cell in row]
5098

5199
# Count results and collect failed tests
52100
if result_column_index is not None and result_column_index < len(row):
53101
result_value = row[result_column_index].strip().lower()
54102
if result_value == "pass":
55-
pass_count += 1
56103
processed_row[result_column_index] = (
57104
'<span style="color:green">Pass</span>'
58105
)
59106
elif result_value == "fail":
60-
fail_count += 1
61107
processed_row[result_column_index] = (
62108
'<span style="color:red">Fail</span>'
63109
)
64110
failed_tests.append(processed_row.copy())
65111
elif result_value == "skip":
66-
skip_count += 1
67112
processed_row[result_column_index] = (
68113
'<span style="color:gray">Skip</span>'
69114
)
70115

71-
# Count result details (excluding empty ones)
72-
if result_detail_column_index is not None and result_detail_column_index < len(
73-
row
74-
):
75-
result_detail_value = row[result_detail_column_index].strip()
76-
if result_detail_value: # Only count non-empty result details
77-
if result_detail_value in result_detail_counts:
78-
result_detail_counts[result_detail_value] += 1
79-
else:
80-
result_detail_counts[result_detail_value] = 1
116+
return AggregatedSummary(
117+
counts=counts,
118+
failed_tests=failed_tests,
119+
counts_by_params=counts_by_param,
120+
header=header,
121+
)
122+
123+
124+
def escape_for_markdown(text: str) -> str:
125+
"""
126+
Modify a string to properly display in a markdown table cell.
127+
"""
128+
if not text:
129+
return text
81130

82-
processed_rows.append(processed_row)
131+
# Replace newlines with <br /> tags
132+
escaped = text.replace("\n", "<br />")
133+
134+
# Escape backslashes.
135+
escaped = escaped.replace("\\", "\\\\")
136+
137+
# Escape pipe characters that would break table structure
138+
escaped = escaped.replace("|", "\\|")
139+
140+
return escaped
141+
142+
143+
def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901)
144+
# Print warning if exit code is non-zero
145+
if exit_code != 0:
146+
print("> [!WARNING]")
147+
print(
148+
f"> Exit code {exit_code} was non-zero. Test process may have crashed. Check the job logs for more information.\n"
149+
)
150+
151+
results = aggregate_results(csv_path)
83152

84153
# Generate Summary section
85-
total_rows = len(data_rows)
86154
print("# Summary\n")
87-
print(f"- **Pass**: {pass_count}/{total_rows}")
88-
print(f"- **Fail**: {fail_count}/{total_rows}")
89-
print(f"- **Skip**: {skip_count}/{total_rows}")
155+
total_excluding_skips = results.counts.passes + results.counts.fails
156+
pass_fraction = results.counts.passes / total_excluding_skips
157+
fail_fraction = results.counts.fails / total_excluding_skips
158+
print(
159+
f"- **Pass**: {results.counts.passes}/{total_excluding_skips} ({pass_fraction*100:.2f}%)"
160+
)
161+
print(
162+
f"- **Fail**: {results.counts.fails}/{total_excluding_skips} ({fail_fraction*100:.2f}%)"
163+
)
164+
print(f"- **Skip**: {results.counts.skips}")
165+
166+
if results.counts_by_params:
167+
print("\n## Results by Parameters\n")
168+
169+
# Extract all unique parameter keys from the JSON strings
170+
all_param_keys = set()
171+
parsed_params = {}
172+
173+
for params_str in results.counts_by_params.keys():
174+
# Parse the JSON string (it's a string representation of a dict)
175+
params_dict = json.loads(params_str)
176+
parsed_params[params_str] = params_dict
177+
all_param_keys.update(params_dict.keys())
178+
179+
if parsed_params and len(parsed_params) > 1:
180+
# Sort parameter keys for consistent column ordering
181+
sorted_param_keys = sorted(all_param_keys)
182+
183+
# Create table header
184+
header_cols = sorted_param_keys + ["Pass", "Fail", "Skip", "Pass %"]
185+
print("| " + " | ".join(header_cols) + " |")
186+
print("|" + "|".join(["---"] * len(header_cols)) + "|")
187+
188+
# Create table rows
189+
for params_str, counts in results.counts_by_params.items():
190+
if params_str in parsed_params:
191+
params_dict = parsed_params[params_str]
192+
row_values = []
193+
194+
# Add parameter values
195+
for key in sorted_param_keys:
196+
value = params_dict.get(key, "")
197+
row_values.append(str(value))
198+
199+
pass_fraction = counts.passes / (counts.passes + counts.fails)
200+
201+
# Add count values
202+
row_values.extend(
203+
[
204+
str(counts.passes),
205+
str(counts.fails),
206+
str(counts.skips),
207+
f"{pass_fraction*100:.2f}%",
208+
]
209+
)
210+
211+
print("| " + " | ".join(row_values) + " |")
212+
213+
print()
90214

91215
print("## Failure Breakdown:")
92-
total_rows_with_result_detail = sum(result_detail_counts.values())
93-
for detail, count in sorted(result_detail_counts.items()):
216+
total_rows_with_result_detail = sum(results.counts.by_detail.values())
217+
for detail, count in sorted(results.counts.by_detail.items()):
94218
print(f"- **{detail}**: {count}/{total_rows_with_result_detail}")
95219

96220
# Generate Failed Tests section
97221
print("# Failed Tests\n")
98-
if failed_tests:
99-
print("| " + " | ".join(header) + " |")
100-
print("|" + "|".join(["---"] * len(header)) + "|")
101-
for row in failed_tests:
222+
if results.failed_tests:
223+
escaped_header = [escape_for_markdown(col) for col in results.header]
224+
print("| " + " | ".join(escaped_header) + " |")
225+
print("|" + "|".join(["---"] * len(results.header)) + "|")
226+
for row in results.failed_tests:
102227
print("| " + " | ".join(row) + " |")
103228
else:
104229
print("No failed tests.\n")

backends/test/suite/models/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ def wrapped_test(self):
5252
"use_dynamic_shapes": use_dynamic_shapes,
5353
}
5454
with TestContext(test_name, test_func.__name__, flow.name, params):
55+
if flow.should_skip_test(test_name):
56+
raise unittest.SkipTest(
57+
f"Skipping test due to matching flow {flow.name} skip patterns"
58+
)
59+
5560
test_func(self, flow, dtype, use_dynamic_shapes)
5661

5762
wrapped_test._name = test_func.__name__ # type: ignore

backends/test/suite/operators/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ def _make_wrapped_test(
9797
):
9898
def wrapped_test(self):
9999
with TestContext(test_name, test_base_name, flow.name, params):
100+
if flow.should_skip_test(test_name):
101+
raise unittest.SkipTest(
102+
f"Skipping test due to matching flow {flow.name} skip patterns"
103+
)
104+
100105
test_kwargs = copy.copy(params) or {}
101106
test_kwargs["flow"] = flow
102107

backends/test/suite/operators/test_abs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# pyre-unsafe
88

99

10+
import unittest
11+
1012
import torch
1113
from executorch.backends.test.suite.flow import TestFlow
1214

@@ -45,6 +47,7 @@ def test_abs_shapes(self, flow: TestFlow) -> None:
4547
# 3D tensor
4648
self._test_op(AbsModel(), (torch.randn(3, 4, 5),), flow)
4749

50+
@unittest.skip("NaN and Inf are not enforced for backends.")
4851
def test_abs_edge_cases(self, flow: TestFlow) -> None:
4952
# Test edge cases
5053

0 commit comments

Comments
 (0)