diff --git a/.ci/scripts/test_backend_linux.sh b/.ci/scripts/test_backend_linux.sh
index 243602fea21..d230860875d 100755
--- a/.ci/scripts/test_backend_linux.sh
+++ b/.ci/scripts/test_backend_linux.sh
@@ -39,12 +39,17 @@ if [[ "$FLOW" == *qnn* ]]; then
 fi
 
 if [[ "$FLOW" == *vulkan* ]]; then
-  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate.
   source .ci/scripts/setup-vulkan-linux-deps.sh
   EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
+if [[ "$FLOW" == *arm* ]]; then
+  # Setup ARM deps.
+  .ci/scripts/setup-arm-baremetal-tools.sh
+fi
+
 # We need the runner to test the built library.
 PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true
diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml
new file mode 100644
index 00000000000..e57be2704a2
--- /dev/null
+++ b/.github/workflows/test-backend-arm.yml
@@ -0,0 +1,27 @@
+name: Test ARM Backend
+
+on:
+  schedule:
+    - cron: 0 2 * * *
+  push:
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+    paths:
+      - .github/workflows/test-backend-arm.yml
+      - .github/workflows/_test_backend.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  test-arm:
+    uses: ./.github/workflows/_test_backend.yml
+    with:
+      backend: arm
+      flows: '["arm_tosa"]'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      run-linux: true
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index bb249644c47..8bf72827549 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -57,6 +57,7 @@
 
 from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
 
+from executorch.backends.test.harness.error_statistics import ErrorStatistics
 from executorch.backends.test.harness.stages import Stage, StageType
 from executorch.backends.xnnpack.test.tester import Tester
 from executorch.devtools.backend_debug import get_delegation_info
@@ -333,6 +334,7 @@ def to_edge_transform_and_lower(
         transform_passes: Optional[
             Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
         ] = None,
+        generate_etrecord: bool = False,
     ):
         if transform_passes is not None:
             raise RuntimeError(
@@ -367,7 +369,9 @@ def to_edge_transform_and_lower(
             to_edge_and_lower_stage.partitioners = partitioners
         if edge_compile_config is not None:
             to_edge_and_lower_stage.edge_compile_conf = edge_compile_config
-        return super().to_edge_transform_and_lower(to_edge_and_lower_stage)
+        return super().to_edge_transform_and_lower(
+            to_edge_and_lower_stage, generate_etrecord=generate_etrecord
+        )
 
     def to_executorch(self, to_executorch_stage: Optional[ToExecutorch] | None = None):
         if to_executorch_stage is None:
@@ -402,6 +406,7 @@ def run_method_and_compare_outputs(
         qtol=0,
         error_callbacks=None,
         run_eager_mode=False,
+        statistics_callback: Callable[[ErrorStatistics], None] | None = None,
     ):
         """
         Compares the run_artifact output of 'stage' with the output of a reference stage.
@@ -657,10 +662,17 @@ def _compare_outputs(
         rtol=1e-03,
         qtol=0,
         error_callbacks=None,
+        statistics_callback: Callable[[ErrorStatistics], None] | None = None,
     ):
         try:
             super()._compare_outputs(
-                reference_output, stage_output, quantization_scale, atol, rtol, qtol
+                reference_output,
+                stage_output,
+                quantization_scale,
+                atol,
+                rtol,
+                qtol,
+                statistics_callback=statistics_callback,
             )
         except AssertionError as e:
             if error_callbacks is None:
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
index 9df3805444a..a4b34fee98d 100644
--- a/backends/test/suite/flow.py
+++ b/backends/test/suite/flow.py
@@ -38,6 +38,9 @@ class TestFlow:
     skip_patterns: list[str] = field(default_factory=lambda: [])
     """ Tests with names containing any substrings in this list are skipped. """
 
+    supports_serialize: bool = True
+    """ True if the test flow supports the Serialize stage. """
+
    def should_skip_test(self, test_name: str) -> bool:
        return any(pattern in test_name for pattern in self.skip_patterns)
 
@@ -115,4 +118,13 @@ def all_flows() -> dict[str, TestFlow]:
     except Exception as e:
         logger.info(f"Skipping QNN flow registration: {e}")
 
+    try:
+        from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW
+
+        flows += [
+            ARM_TOSA_FLOW,
+        ]
+    except Exception as e:
+        logger.info(f"Skipping ARM flow registration: {e}")
+
     return {f.name: f for f in flows if f is not None}
diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py
new file mode 100644
index 00000000000..baa2df79de9
--- /dev/null
+++ b/backends/test/suite/flows/arm.py
@@ -0,0 +1,24 @@
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.test.suite.flow import TestFlow
+
+
+def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester:
+    kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+
+    return ArmTester(
+        *args,
+        **kwargs,
+    )
+
+
+def _create_tosa_flow() -> TestFlow:
+    return TestFlow(
+        "arm_tosa",
+        backend="arm",
+        tester_factory=_create_arm_tester_tosa_fp,
+        supports_serialize=False,
+    )
+
+
+ARM_TOSA_FLOW = _create_tosa_flow()
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
index 1f84db9c730..eeea09e0fc1 100644
--- a/backends/test/suite/runner.py
+++ b/backends/test/suite/runner.py
@@ -15,6 +15,7 @@
 UNSUPPORTED_PORTABLE_OPS = {
     "aten::_embedding_bag",
     "aten::_adaptive_avg_pool2d",
+    "aten::adaptive_max_pool2d",
     "aten::median",
     "aten::median.dim",
     "aten::round.decimals",
@@ -34,6 +35,7 @@
     TestResult,
 )
 from executorch.exir import EdgeProgramManager
+from executorch.exir.dialects._ops import ops as exir_ops
 
 
 # A list of all runnable test suites and the corresponding python package.
@@ -43,6 +45,24 @@
 }
 
 
+def _graph_has_unsupported_patterns(program: torch.export.ExportedProgram) -> bool:
+    # Returns true if the model contains patterns that will fail when running on the ET
+    # portable kernel library.
+
+    # Check for 3d convolutions. All convs (1d, 2d, 3d) use the same op, so we need to look at
+    # the input meta to determine the rank.
+    for node in program.graph.nodes:
+        if (
+            node.op == "call_function"
+            and node.target == exir_ops.edge.aten.convolution.default
+        ):
+            in_rank = node.args[0].meta["val"].dim()
+            if in_rank != 4:
+                return True
+
+    return False
+
+
 def _get_test_seed(test_base_name: str) -> int:
     # Set the seed based on the test base name to give consistent inputs between backends.
     # Add the run seed to allow for reproducible results, but still allow for run-to-run variation.
@@ -162,7 +182,7 @@ def build_result(
     # Check if any undelegated ops are in the unsupported ops set.
     has_unsupported_ops = any(
         op in UNSUPPORTED_PORTABLE_OPS for op in undelegated_op_counts.keys()
-    )
+    ) or _graph_has_unsupported_patterns(edge_manager._etrecord.edge_dialect_program)
 
     # Skip the test if there are unsupported portable ops remaining.
     if has_unsupported_ops:
@@ -171,8 +191,11 @@ def build_result(
     # Only run the runtime portion if something was delegated (or the flow doesn't delegate)
     if is_delegated or not flow.is_delegated:
         try:
-            tester.to_executorch().serialize()
-            extra_stats["pte_size_bytes"] = len(tester.get_artifact())
+            tester.to_executorch()
+
+            if flow.supports_serialize:
+                tester.serialize()
+                extra_stats["pte_size_bytes"] = len(tester.get_artifact())
+
         except Exception as e:
             # We could introduce a result value for this, but I'm not sure it's necessary.
             # We can do this if we ever see to_executorch() or serialize() fail due a backend issue.
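
Note on the _graph_has_unsupported_patterns() check added in runner.py: 1d, 2d, and 3d convolutions all lower to the same edge convolution op, so the only way to detect a 3d conv is to inspect the rank of the input tensor's meta. The snippet below is a minimal, hypothetical sketch (not part of the patch) that applies the same rank check to a toy Conv3d model; TinyConv3d and has_non_2d_conv are illustrative names, and it assumes torch.export plus executorch.exir.to_edge handle the 3d convolution without error.

import torch
from executorch.exir import to_edge
from executorch.exir.dialects._ops import ops as exir_ops


class TinyConv3d(torch.nn.Module):  # hypothetical toy model, not from the patch
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv3d(2, 4, kernel_size=3)

    def forward(self, x):
        return self.conv(x)


def has_non_2d_conv(program: torch.export.ExportedProgram) -> bool:
    # Mirrors the runner.py check: flag any convolution whose input is not rank 4 (NCHW),
    # since per the patch such graphs fail on the ET portable kernel library.
    for node in program.graph.nodes:
        if (
            node.op == "call_function"
            and node.target == exir_ops.edge.aten.convolution.default
        ):
            if node.args[0].meta["val"].dim() != 4:
                return True
    return False


if __name__ == "__main__":
    exported = torch.export.export(TinyConv3d(), (torch.randn(1, 2, 8, 8, 8),))
    edge = to_edge(exported)  # assumes to_edge lowers the 3d convolution
    print(has_non_2d_conv(edge.exported_program()))  # expected: True (5d NCDHW input)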