From b58a26d38264bcec1d657a6fa76333a31f485732 Mon Sep 17 00:00:00 2001
From: Adrian Lundell <adrian.lundell@arm.com>
Date: Tue, 7 Jan 2025 12:01:44 +0100
Subject: [PATCH 1/2] Align handling of flaky arm unittests

- Removes uses of torch.manual_seed, which previously fixed the random state.
- Adds pytest plugin pytest-rerunfailures to mark flaky tests.
- Refactors flaky tests to use data generators instead of pregenerated data, which ensures that data is randomized between reruns.
- Updates layer_norm testcase to use same qtol value for TOSA/EthosU targets.

Note that removing the fixed random seed may lead to more flakiness in CI; this will have to be addressed with the flaky mark on a case-by-case basis over time.

Change-Id: I15aa8b517bec2a748b93b0d74e09e2f48df40926
---
 backends/arm/test/ops/test_bmm.py            | 101 ++++++++++---------
 backends/arm/test/ops/test_conv1d.py         |  16 +--
 backends/arm/test/ops/test_conv2d.py         |  21 ++--
 backends/arm/test/ops/test_conv_combos.py    |   3 +-
 backends/arm/test/ops/test_depthwise_conv.py |   2 +-
 backends/arm/test/ops/test_layer_norm.py     |   2 +-
 backends/arm/test/ops/test_logsoftmax.py     |  83 +++++++--------
 backends/arm/test/ops/test_mm.py             |  81 ++++++++-------
 backends/arm/test/ops/test_softmax.py        |  83 +++++++--------
 pyproject.toml                               |   1 +
 10 files changed, 190 insertions(+), 203 deletions(-)

diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index 06470d91e82..abab6cd1e8a 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -6,7 +6,7 @@
 
 import unittest
 
-from typing import Tuple
+from typing import Callable, Tuple
 
 import pytest
 
@@ -16,39 +16,37 @@
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
 
-torch.manual_seed(1)
-
 
 class TestBMM(unittest.TestCase):
     """Tests Batch MatMul"""
 
     class BMM(torch.nn.Module):
-        test_parameters = [
-            (torch.rand(2, 1, 1), torch.rand(2, 1, 1)),
-            (torch.rand(5, 3, 5), torch.rand(5, 5, 2)),
-            (torch.ones(1, 55, 3), torch.ones(1, 3, 44)),
-            (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)),
-            (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)),
+        test_data_generators = [
+            lambda: (torch.rand(2, 1, 1), torch.rand(2, 1, 1)),
+            lambda: (torch.rand(5, 3, 5), torch.rand(5, 5, 2)),
+            lambda: (torch.ones(1, 55, 3), torch.ones(1, 3, 44)),
+            lambda: (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)),
+            lambda: (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)),
         ]
 
         def forward(self, x, y):
             return torch.bmm(x, y)
 
     class MatMul(torch.nn.Module):
-        test_parameters = [
-            (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
-            (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
+        test_data_generators = [
+            lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
+            lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
         ]
 
         def forward(self, x, y):
             return torch.matmul(x, y)
 
     class BMMSingleInput(torch.nn.Module):
-        test_parameters = [
-            (torch.rand(20, 3, 3),),
-            (torch.rand(2, 128, 128),),
-            (10000 * torch.randn(4, 25, 25),),
-            (5 + 5 * torch.randn(3, 64, 64),),
+        test_data_generators = [
+            lambda: (torch.rand(20, 3, 3),),
+            lambda: (torch.rand(2, 128, 128),),
+            lambda: (10000 * torch.randn(4, 25, 25),),
+            lambda: (5 + 5 * torch.randn(3, 64, 64),),
         ]
 
         def forward(self, x):
@@ -120,67 +118,74 @@ def _test_bmm_ethosu_BI_pipeline(
         if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(inputs=test_data, qtol=1)
 
-    @parameterized.expand(BMM.test_parameters)
-    def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(BMM.test_data_generators)
+    def test_bmm_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data)
 
-    @parameterized.expand(BMMSingleInput.test_parameters)
-    def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    @parameterized.expand(BMMSingleInput.test_data_generators)
+    def test_bmm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data)
 
-    @parameterized.expand(MatMul.test_parameters)
-    def test_matmul_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(MatMul.test_data_generators)
+    def test_matmul_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_tosa_MI_pipeline(self.MatMul(), test_data)
 
-    @parameterized.expand(MatMul.test_parameters)
-    def test_matmul_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(MatMul.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_matmul_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_tosa_BI_pipeline(self.MatMul(), test_data)
 
-    @parameterized.expand(BMM.test_parameters)
-    def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(BMM.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_bmm_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data)
 
-    @parameterized.expand(BMMSingleInput.test_parameters)
-    def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    @parameterized.expand(BMMSingleInput.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_bmm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data)
 
-    @parameterized.expand(BMM.test_parameters)
+    @parameterized.expand(BMM.test_data_generators)
     @pytest.mark.corstone_fvp
-    @unittest.expectedFailure
-    def test_bmm_u55_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @conftest.expectedFailureOnFVP
+    def test_bmm_u55_BI_xfails(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_ethosu_BI_pipeline(
             self.BMM(), common.get_u55_compile_spec(), test_data
         )
 
-    @parameterized.expand(BMM.test_parameters)
+    @parameterized.expand(BMM.test_data_generators)
     @pytest.mark.corstone_fvp
-    def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_bmm_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_ethosu_BI_pipeline(
             self.BMM(), common.get_u85_compile_spec(), test_data
         )
 
     # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy
-    @parameterized.expand(BMMSingleInput.test_parameters)
+    @parameterized.expand(BMMSingleInput.test_data_generators)
     @pytest.mark.corstone_fvp
     @unittest.expectedFailure
-    def test_bmm_single_input_u55_BI_xfails(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    def test_bmm_single_input_u55_BI_xfails(
+        self, test_data_generator: Callable[[], Tuple]
+    ):
+        test_data = test_data_generator()
         self._test_bmm_ethosu_BI_pipeline(
             self.BMMSingleInput(), common.get_u55_compile_spec(), test_data
         )
 
-    @parameterized.expand(BMMSingleInput.test_parameters)
+    @parameterized.expand(BMMSingleInput.test_data_generators)
     @pytest.mark.corstone_fvp
-    def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_bmm_single_input_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_bmm_ethosu_BI_pipeline(
             self.BMMSingleInput(), common.get_u85_compile_spec(), test_data
         )
diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py
index 3e0dfa6c5c4..92da09a5ef3 100644
--- a/backends/arm/test/ops/test_conv1d.py
+++ b/backends/arm/test/ops/test_conv1d.py
@@ -6,7 +6,7 @@
 
 import unittest
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
 
 import pytest
 
@@ -25,7 +25,6 @@ class Conv1d(torch.nn.Module):
 
     def __init__(
         self,
-        inputs: Optional[torch.Tensor] = None,
         length=8,
         nbr_conv=1,  # Number of chained convs
         in_channels: Union[List, int, None] = None,
@@ -75,11 +74,10 @@ def __init__(
         if not isinstance(padding_mode, List):
             padding_mode = [padding_mode]
 
-        # Generate test data if not provided
-        if inputs is None:
-            self.inputs = (torch.randn(batches, in_channels[0], length).to(dtype),)
-        else:
-            self.inputs = (inputs,)
+        self.batches = batches
+        self.in_channels = in_channels
+        self.length = length
+        self.dtype = dtype
 
         # Build chain of convs
         for i in range(self.nbr_convs):
@@ -100,7 +98,9 @@ def __init__(
             )
 
     def get_inputs(self):
-        return self.inputs
+        return (
+            torch.randn(self.batches, self.in_channels[0], self.length).to(self.dtype),
+        )
 
     def forward(self, x):
         for i in range(self.nbr_convs):
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
index b80228c6f25..16ebe7fb25e 100644
--- a/backends/arm/test/ops/test_conv2d.py
+++ b/backends/arm/test/ops/test_conv2d.py
@@ -6,7 +6,7 @@
 
 import unittest
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
 
 import pytest
 
@@ -25,7 +25,6 @@ class Conv2d(torch.nn.Module):
 
     def __init__(
         self,
-        inputs: Optional[torch.Tensor] = None,
         height=8,
         width=8,
         nbr_conv=1,  # Number of chained convs
@@ -76,13 +75,11 @@ def __init__(
         if not isinstance(padding_mode, List):
             padding_mode = [padding_mode]
 
-        # Generate test data if not provided
-        if inputs is None:
-            self.inputs = (
-                torch.randn(batches, in_channels[0], height, width).to(dtype),
-            )
-        else:
-            self.inputs = (inputs,)
+        self.batches = batches
+        self.in_channels = in_channels
+        self.height = height
+        self.width = width
+        self.dtype = dtype
 
         # Build chain of convs
         for i in range(self.nbr_convs):
@@ -103,7 +100,11 @@ def __init__(
             )
 
     def get_inputs(self):
-        return self.inputs
+        return (
+            torch.randn(self.batches, self.in_channels[0], self.height, self.width).to(
+                self.dtype
+            ),
+        )
 
     def forward(self, x):
         for i in range(self.nbr_convs):
diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py
index 8352727a1c3..ae3f28b3709 100644
--- a/backends/arm/test/ops/test_conv_combos.py
+++ b/backends/arm/test/ops/test_conv_combos.py
@@ -353,8 +353,7 @@ def test_block_bottleneck_residual_tosa_MI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs())
 
-    # TODO: Investigate flakyness (MLTORCH-307)
-    @unittest.skip(reason="Skiped due to flakyness (MLTORCH-307)")
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLTORCH-307)
     def test_block_bottleneck_residual_tosa_BI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs())
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
index 22d9798aeaf..c4fa9810dc0 100644
--- a/backends/arm/test/ops/test_depthwise_conv.py
+++ b/backends/arm/test/ops/test_depthwise_conv.py
@@ -252,8 +252,8 @@ def _test_dw_conv_ethos_BI_pipeline(
     def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_tosa_MI_pipeline(model, model.get_inputs())
 
-    # TODO: Investigate flakyness (MLTORCH-307)
     @parameterized.expand(testsuite_conv1d + testsuite_conv2d)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLTORCH-307)
     def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs())
 
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index c287f51ebcc..82f0af8dcf7 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -109,7 +109,7 @@ def _test_layernorm_tosa_BI_pipeline(
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
-            .run_method_and_compare_outputs(inputs=test_data)
+            .run_method_and_compare_outputs(qtol=1, inputs=test_data)
         )
 
     def _test_layernorm_ethosu_BI_pipeline(
diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py
index d1581423a0a..f34d4afbb55 100644
--- a/backends/arm/test/ops/test_logsoftmax.py
+++ b/backends/arm/test/ops/test_logsoftmax.py
@@ -6,7 +6,9 @@
 
 import unittest
 
-from typing import Tuple
+from typing import Callable, Tuple
+
+import pytest
 
 import torch
 from executorch.backends.arm.test import common
@@ -15,27 +17,27 @@
 from parameterized import parameterized
 
 
-test_data_suite = [
+test_data_generators = [
     # (test_name, test_data, dim)
-    ("zeros", torch.zeros(10, 8, 5, 2), 0),
-    ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
-    ("ones", torch.ones(10, 10), 1),
-    ("ones_neg_dim", torch.ones(10, 3, 4), -1),
-    ("rand", torch.rand(1, 2, 5, 8), 2),
-    ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
-    ("randn", torch.randn(10, 10, 10, 10), 3),
-    ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+    lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+    lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+    lambda: ("ones", torch.ones(10, 10), 1),
+    lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+    lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+    lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+    lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
+    lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
 ]
-test_data_suite_u55 = [
+test_data_generators_u55 = [
     # (test_name, test_data, dim)
-    ("ones", torch.ones(10, 10), 1),
-    ("ones_neg_dim", torch.ones(10, 3, 4), -1),
-    ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
-    ("zeros", torch.zeros(10, 8, 5, 2), 0),
-    ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
-    ("rand", torch.rand(1, 2, 5, 8), 2),
-    ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
-    ("randn", torch.randn(10, 10, 10, 10), 3),
+    lambda: ("ones", torch.ones(10, 10), 1),
+    lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+    lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+    lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+    lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+    lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+    lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+    lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
 ]
 
 
@@ -128,42 +130,29 @@ def _test_logsoftmax_tosa_u85_BI_pipeline(
             common.get_u85_compile_spec(), module, test_data
         )
 
-    @parameterized.expand(test_data_suite)
-    def test_logsoftmax_tosa_MI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators)
+    def test_logsoftmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_logsoftmax_tosa_MI_pipeline(self.LogSoftmax(dim=dim), (test_data,))
 
-    @parameterized.expand(test_data_suite)
-    def test_logsoftmax_tosa_BI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators)
+    @pytest.mark.flaky  # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+    def test_logsoftmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_logsoftmax_tosa_BI_pipeline(self.LogSoftmax(dim=dim), (test_data,))
 
-    @parameterized.expand(test_data_suite_u55)
-    def test_logsoftmax_tosa_u55_BI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators_u55)
+    @pytest.mark.flaky  # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+    def test_logsoftmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_logsoftmax_tosa_u55_BI_pipeline(
             self.LogSoftmax(dim=dim), (test_data,)
         )
 
-    @parameterized.expand(test_data_suite)
-    def test_logsoftmax_tosa_u85_BI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators)
+    @pytest.mark.flaky  # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+    def test_logsoftmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_logsoftmax_tosa_u85_BI_pipeline(
             self.LogSoftmax(dim=dim), (test_data,)
         )
diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py
index 5fa28076aac..ba5b0eb1b86 100644
--- a/backends/arm/test/ops/test_mm.py
+++ b/backends/arm/test/ops/test_mm.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -7,8 +7,9 @@
 import logging
 import unittest
 
-from typing import Tuple
+from typing import Callable, Tuple
 
+import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
@@ -18,30 +19,28 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-torch.manual_seed(0)
-
 
 class TestMM(unittest.TestCase):
     """Tests MatMul"""
 
     class MM(torch.nn.Module):
-        test_parameters = [
-            (torch.rand(3, 5), torch.rand(5, 2)),
-            (torch.rand(1, 1), torch.rand(1, 1)),
-            (torch.ones(55, 3), torch.ones(3, 44)),
-            (10000 * torch.randn(1, 10), torch.randn(10, 5)),
-            (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)),
+        test_data_generators = [
+            lambda: (torch.rand(3, 5), torch.rand(5, 2)),
+            lambda: (torch.rand(1, 1), torch.rand(1, 1)),
+            lambda: (torch.ones(55, 3), torch.ones(3, 44)),
+            lambda: (10000 * torch.randn(1, 10), torch.randn(10, 5)),
+            lambda: (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)),
         ]
 
         def forward(self, x, y):
             return torch.mm(x, y)
 
     class MMSingleInput(torch.nn.Module):
-        test_parameters = [
-            (torch.rand(3, 3),),
-            (torch.ones(128, 128),),
-            (10000 * torch.randn(25, 25),),
-            (5 + 5 * torch.randn(64, 64),),
+        test_data_generators = [
+            lambda: (torch.rand(3, 3),),
+            lambda: (torch.ones(128, 128),),
+            lambda: (10000 * torch.randn(25, 25),),
+            lambda: (5 + 5 * torch.randn(64, 64),),
         ]
 
         def forward(self, x):
@@ -110,54 +109,58 @@ def _test_mm_ethosu_BI_pipeline(
             .to_executorch()
         )
 
-    @parameterized.expand(MM.test_parameters)
-    def test_mm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(MM.test_data_generators)
+    def test_mm_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_tosa_MI_pipeline(self.MM(), test_data)
 
-    @parameterized.expand(MMSingleInput.test_parameters)
-    def test_mm_single_input_tosa_MI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    @parameterized.expand(MMSingleInput.test_data_generators)
+    def test_mm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_tosa_MI_pipeline(self.MMSingleInput(), test_data)
 
-    @parameterized.expand(MM.test_parameters)
-    def test_mm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(MM.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_mm_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_tosa_BI_pipeline(self.MM(), test_data)
 
-    @parameterized.expand(MMSingleInput.test_parameters)
-    def test_mm_single_input_tosa_BI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    @parameterized.expand(MMSingleInput.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_mm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_tosa_BI_pipeline(self.MMSingleInput(), test_data)
 
     # Expected to fail with error: CPU performance estimation for "MatMul" not implemented
-    @parameterized.expand(MM.test_parameters)
+    @parameterized.expand(MM.test_data_generators)
     @unittest.expectedFailure
-    def test_mm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    def test_mm_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_ethosu_BI_pipeline(
             common.get_u55_compile_spec(), self.MM(), test_data
         )
 
     # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy
-    @parameterized.expand(MMSingleInput.test_parameters)
+    @parameterized.expand(MMSingleInput.test_data_generators)
     @unittest.expectedFailure
-    def test_mm_single_input_u55_BI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    def test_mm_single_input_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_ethosu_BI_pipeline(
             common.get_u55_compile_spec(), self.MMSingleInput(), test_data
         )
 
-    @parameterized.expand(MM.test_parameters)
-    def test_mm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
-        test_data = (operand1, operand2)
+    @parameterized.expand(MM.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_mm_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_ethosu_BI_pipeline(
             common.get_u85_compile_spec(), self.MM(), test_data
         )
 
-    @parameterized.expand(MMSingleInput.test_parameters)
-    def test_mm_single_input_u85_BI(self, operand1: torch.Tensor):
-        test_data = (operand1,)
+    @parameterized.expand(MMSingleInput.test_data_generators)
+    @pytest.mark.flaky  # TODO: Investigate flakyness (MLETORCH-534)
+    def test_mm_single_input_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_data = test_data_generator()
         self._test_mm_ethosu_BI_pipeline(
             common.get_u85_compile_spec(), self.MMSingleInput(), test_data
         )
diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py
index 794f6b791f7..c60da18594f 100644
--- a/backends/arm/test/ops/test_softmax.py
+++ b/backends/arm/test/ops/test_softmax.py
@@ -7,7 +7,9 @@
 
 import unittest
 
-from typing import Tuple
+from typing import Callable, Tuple
+
+import pytest
 
 import torch
 from executorch.backends.arm.test import common
@@ -16,28 +18,28 @@
 from parameterized import parameterized
 
 
-test_data_suite = [
+test_data_generators = [
     # (test_name, test_data, dim)
-    ("zeros", torch.zeros(10, 8, 5, 2), 0),
-    ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
-    ("ones", torch.ones(10, 10), 1),
-    ("ones_neg_dim", torch.ones(10, 3, 4), -1),
-    ("rand", torch.rand(1, 2, 5, 8), 2),
-    ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
-    ("randn", torch.randn(10, 10, 10, 10), 3),
-    ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+    lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+    lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+    lambda: ("ones", torch.ones(10, 10), 1),
+    lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+    lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+    lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+    lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
+    lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
 ]
 
-test_data_suite_u55 = [
+test_data_generators_u55 = [
     # (test_name, test_data, dim)
-    ("ones", torch.ones(10, 10), 1),
-    ("ones_neg_dim", torch.ones(10, 3, 4), -1),
-    ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
-    ("zeros", torch.zeros(10, 8, 5, 2), 0),
-    ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
-    ("rand", torch.rand(1, 2, 5, 8), 2),
-    ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
-    ("randn", torch.randn(10, 10, 10, 10), 3),
+    lambda: ("ones", torch.ones(10, 10), 1),
+    lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+    lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+    lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+    lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+    lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+    lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+    lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
 ]
 
 
@@ -130,38 +132,25 @@ def _test_softmax_tosa_u85_BI_pipeline(
             common.get_u85_compile_spec(), module, test_data
         )
 
-    @parameterized.expand(test_data_suite)
-    def test_softmax_tosa_MI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators)
+    def test_softmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_softmax_tosa_MI_pipeline(self.Softmax(dim=dim), (test_data,))
 
-    @parameterized.expand(test_data_suite)
-    def test_softmax_tosa_BI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators)
+    @pytest.mark.flaky  # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+    def test_softmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_softmax_tosa_BI_pipeline(self.Softmax(dim=dim), (test_data,))
 
-    @parameterized.expand(test_data_suite_u55)
-    def test_softmax_tosa_u55_BI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators_u55)
+    @pytest.mark.flaky  # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+    def test_softmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,))
 
-    @parameterized.expand(test_data_suite)
-    def test_softmax_tosa_u85_BI(
-        self,
-        test_name: str,
-        test_data: torch.Tensor,
-        dim: int,
-    ):
+    @parameterized.expand(test_data_generators)
+    @pytest.mark.flaky  # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+    def test_softmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+        test_name, test_data, dim = test_data_generator()
         self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,))
diff --git a/pyproject.toml b/pyproject.toml
index 11673f9c960..c860e1b2c79 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,6 +61,7 @@ dependencies=[
   "parameterized",
   "pytest",
   "pytest-xdist",
+  "pytest-rerunfailures",
   "pyyaml",
   "ruamel.yaml",
   "sympy",

From c358cec15e30b56929b86b20452fb76d60ad5239 Mon Sep 17 00:00:00 2001
From: Adrian Lundell <adrian.lundell@arm.com>
Date: Mon, 20 Jan 2025 10:20:47 +0100
Subject: [PATCH 2/2] Fix expectedFailure

---
 backends/arm/test/ops/test_bmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index abab6cd1e8a..ed1a551ef33 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -153,7 +153,7 @@ def test_bmm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]
 
     @parameterized.expand(BMM.test_data_generators)
     @pytest.mark.corstone_fvp
-    @conftest.expectedFailureOnFVP
+    @unittest.expectedFailure
     def test_bmm_u55_BI_xfails(self, test_data_generator: Callable[[], Tuple]):
         test_data = test_data_generator()
         self._test_bmm_ethosu_BI_pipeline(