Support multiple batches (N > 1) in Arm runtime

AdrianLundell · AdrianLundell · commit f481b124265c · 2024-12-18T11:29:47.000+01:00
Extends FVP testing of multiple batches (N > 1) to several operators. Note that not all operators are currently supported at the compiler level.

Change-Id: I59aee4b80fd058931e02806a83ca639533e7c76b
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -241,9 +241,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
             event_tracer,
             "+ArmBackend::execute()handles.input.permute_CHW_to_HWC()");
         // permuted byte copy CHW to HWC
-        permute_CHW_to_HWC(
+        permute_NCHW_to_NHWC(
             tensor_in.mutable_data_ptr<char>(),
             scratch_addr,
+            tensor_in.size(0),
             tensor_in.size(1),
             tensor_in.size(2),
             tensor_in.size(3));
@@ -342,9 +343,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
             "+ArmBackend::execute()handles.output.permute_HWC_to_CHW()");
 
         char* output_address = (char*)output_addr;
-        permute_HWC_to_CHW(
+        permute_NHWC_to_NCHW(
             output_address,
             tensor_out.mutable_data_ptr<char>(),
+            tensor_out.size(0),
             tensor_out.size(1),
             tensor_out.size(2),
             tensor_out.size(3));
@@ -420,21 +422,53 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
     return Error::Ok;
   }
 
-  void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W)
-      const {
-    for (int i = 0; i != H * W; ++i) {
-      for (int j = 0; j < C; ++j) {
-        output[i * C + j] = input[i + j * W * H];
+  void permute_NCHW_to_NHWC(
+      const char* input, // source bytes, consumed sequentially in N, C, H, W order
+      char* output, // destination bytes, filled in N, H, W, C order
+      const int N, // number of batches
+      const int C, // channels per batch
+      const int H, // rows per channel plane
+      const int W) const { // columns per channel plane
+    for (int n = 0; n < N; n++) {
+      for (int c = 0; c < C; c++) {
+        for (int i = 0; i < H * W; i++) {
+          *output = *input; // one-byte copy: both pointers are char
+          // Next pixel: input advances by 1, output by C (channel-last stride)
+          input++;
+          output += C;
+        }
+        // Undo the H*W*C advance of the pixel loop, then move to the next channel slot
+        output -= (H * W * C); // NOTE(review): until here output is c bytes past this batch's region
+        output++;
       }
+      // output is now C past the batch start: drop that offset, then skip one batch
+      output -= C;
+      output += (H * W * C);
     }
   }
 
-  void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W)
-      const {
-    for (int i = 0; i != H * W; ++i) {
-      for (int j = 0; j < C; ++j) {
-        output[i + j * W * H] = input[i * C + j];
+  void permute_NHWC_to_NCHW(
+      const char* input, // source bytes, consumed sequentially in N, H, W, C order
+      char* output, // destination bytes, filled in N, C, H, W order
+      const int N, // number of batches
+      const int C, // channels per batch
+      const int H, // rows per channel plane
+      const int W) const { // columns per channel plane
+    for (int n = 0; n < N; n++) {
+      for (int i = 0; i < H * W; i++) {
+        for (int c = 0; c < C; c++) {
+          *output = *input; // one-byte copy: both pointers are char
+          // Next channel: input advances by 1, output by one H*W channel plane
+          input++;
+          output += H * W;
+        }
+        // Undo the H*W*C advance of the channel loop, then move to the next pixel
+        output -= (H * W * C); // NOTE(review): until here output is i bytes past this batch's region
+        output++;
       }
+      // output is now H*W past the batch start: drop that offset, then skip one batch
+      output -= H * W;
+      output += (H * W * C);
     }
   }
 };
diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py
@@ -25,8 +25,8 @@ class Add(torch.nn.Module):
             (torch.FloatTensor([1, 2, 3, 5, 7]),),
             (3 * torch.ones(8),),
             (10 * torch.randn(8),),
-            (torch.ones(1, 1, 4, 4),),
-            (torch.ones(1, 3, 4, 2),),
+            (torch.ones(2, 1, 4, 4),),
+            (torch.ones(2, 3, 4, 2),),
         ]
 
         def forward(self, x):
@@ -38,10 +38,10 @@ class Add2(torch.nn.Module):
                 torch.FloatTensor([1, 2, 3, 5, 7]),
                 (torch.FloatTensor([2, 1, 2, 1, 10])),
             ),
-            (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)),
-            (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)),
-            (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)),
-            (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)),
+            (torch.ones(2, 10, 4, 6), torch.ones(2, 10, 4, 6)),
+            (torch.randn(2, 3, 4, 4), torch.randn(2, 3, 4, 4)),
+            (torch.randn(2, 1, 4, 4), torch.ones(2, 1, 4, 1)),
+            (10000 * torch.randn(2, 1, 4, 4), torch.randn(2, 1, 4, 1)),
         ]
 
         def __init__(self):
diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py
@@ -29,6 +29,11 @@
     ("randn", torch.randn(1, 16, 50, 32), [4, 2, 0]),
 ]
 
+test_data_suite_mult_batches = [
+    # (test_name, test_data, [kernel_size, stride, padding])
+    ("rand", torch.rand(2, 16, 50, 32), [4, 2, 0]),
+]
+
 
 class TestAvgPool2d(unittest.TestCase):
     """Tests AvgPool2d."""
@@ -168,3 +173,31 @@ def test_avgpool2d_tosa_u85_BI(
             common.get_u85_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
+
+    @parameterized.expand(test_data_suite_mult_batches)
+    @conftest.expectedFailureOnFVP  # See MLETORCH-517
+    def test_avgpool2d_tosa_u55_BI_mult_batches(
+        self,
+        test_name: str,
+        test_data: torch.Tensor,
+        model_params: int | Tuple[int, int],
+    ):
+        self._test_avgpool2d_tosa_ethos_BI_pipeline(
+            self.AvgPool2d(*model_params),
+            common.get_u55_compile_spec(permute_memory_to_nhwc=True),
+            (test_data,),
+        )
+
+    @parameterized.expand(test_data_suite_mult_batches)
+    @conftest.expectedFailureOnFVP  # See MLETORCH-517
+    def test_avgpool2d_tosa_u85_BI_mult_batches(
+        self,
+        test_name: str,
+        test_data: torch.Tensor,
+        model_params: int | Tuple[int, int],
+    ):
+        self._test_avgpool2d_tosa_ethos_BI_pipeline(
+            self.AvgPool2d(*model_params),
+            common.get_u85_compile_spec(permute_memory_to_nhwc=True),
+            (test_data,),
+        )
diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py
@@ -22,7 +22,7 @@
     # (test_name, test_data, [num_features, affine, track_running_stats, weight, bias, running_mean, running_var,] )
     (
         "zeros_affineT_runStatsT_default_weight_bias_mean_var",
-        torch.zeros(1, 32, 112, 112),
+        torch.zeros(2, 32, 112, 112),
         [
             32,
             True,
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
@@ -161,10 +161,10 @@
     ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1),
     ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1),
     ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias),
+    ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3),
 ]
 
 testsuite_conv2d_u85_xfails = [
-    ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3),
     ("two_dw_conv2d", two_dw_conv2d),
 ]
 
diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py
@@ -32,18 +32,18 @@
         torch.ones(5) * (-1),
         None,
     ),
-    (
-        "op_div_rank1_rand",
-        torch.rand(5) * 5,
-        torch.rand(5) * 5,
-        None,
-    ),
     (
         "op_div_rank4_ones",
         torch.ones(5, 10, 25, 20),
         torch.ones(5, 10, 25, 20),
         None,
     ),
+    (
+        "op_div_rank1_rand",
+        torch.rand(5) * 5,
+        torch.rand(5) * 5,
+        None,
+    ),
     (
         "op_div_rank4_negative_ones",
         (-1) * torch.ones(5, 10, 25, 20),
@@ -183,7 +183,7 @@ def test_div_tosa_BI(
         test_data = (input_, other_)
         self._test_div_tosa_BI_pipeline(self.Div(), test_data)
 
-    @parameterized.expand(test_data_suite[:2])
+    @parameterized.expand(test_data_suite[:3])
     def test_div_u55_BI(
         self,
         test_name: str,
@@ -197,7 +197,7 @@ def test_div_u55_BI(
         )
 
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
-    @parameterized.expand(test_data_suite[2:])
+    @parameterized.expand(test_data_suite[3:])
     @conftest.expectedFailureOnFVP
     def test_div_u55_BI_xfails(
         self,
@@ -211,7 +211,7 @@ def test_div_u55_BI_xfails(
             self.Div(), common.get_u55_compile_spec(), test_data
         )
 
-    @parameterized.expand(test_data_suite[:2])
+    @parameterized.expand(test_data_suite[:3])
     def test_div_u85_BI(
         self,
         test_name: str,
@@ -225,7 +225,7 @@ def test_div_u85_BI(
         )
 
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
-    @parameterized.expand(test_data_suite[2:])
+    @parameterized.expand(test_data_suite[3:])
     @conftest.expectedFailureOnFVP
     def test_div_u85_BI_xfails(
         self,
diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py
@@ -17,10 +17,10 @@
 
 test_data_suite = [
     # (test_name, test_data)
-    ("zeros", torch.zeros(1, 10, 10, 10)),
+    ("zeros", torch.zeros(2, 10, 10, 10)),
     ("ones", torch.ones(10, 10, 10)),
     ("rand", torch.rand(10, 10) - 0.5),
-    ("randn_pos", torch.randn(1, 4, 4, 4) + 10),
+    ("randn_pos", torch.randn(2, 4, 4, 4) + 10),
     ("randn_neg", torch.randn(10) - 10),
     ("ramp", torch.arange(-16, 16, 0.2)),
 ]
diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py
@@ -23,11 +23,11 @@
 
 test_data_suite = [
     # (test_name, test_data)
-    ("zeros", torch.zeros(1, 10, 10, 10)),
+    ("zeros", torch.zeros(2, 10, 10, 10)),
     ("ones", torch.ones(10, 10, 10)),
     ("rand", torch.rand(10, 10) - 0.5),
     ("randn_pos", torch.randn(10) + 10),
-    ("randn_neg", torch.randn(10) - 10),
+    ("randn_neg", torch.randn(2, 10, 10, 10) - 10),
     ("ramp", torch.arange(-16, 16, 0.2)),
 ]
 
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
@@ -16,8 +16,8 @@
 
 test_data_suite = [
     # (test_name, test_data, [normalized_shape, eps, elementwise_affine, has_bias] )
-    ("randn_last_dim", torch.randn(1, 5, 5, 5), [[5]]),
-    ("rand_last_two_dims", torch.rand(1, 5, 5, 5), [[5, 5]]),
+    ("randn_last_dim", torch.randn(2, 5, 5, 5), [[5]]),
+    ("rand_last_two_dims", torch.rand(2, 5, 5, 5), [[5, 5]]),
     (
         "rand_last_two_dims_not_elementwise_affine",
         torch.rand(1, 5, 5, 5),
diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py
@@ -17,11 +17,11 @@
 
 test_data_suite = [
     # (test_name, test_data)
-    ("ones_rank4", torch.ones(1, 10, 10, 10)),
+    ("ones_rank4", torch.ones(2, 10, 10, 10)),
     ("ones_rank3", torch.ones(10, 10, 10)),
     ("rand", torch.rand(10, 10) + 0.001),
     ("randn_pos", torch.randn(10) + 10),
-    ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)),
+    ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(2, 10, 10, 10) * 100)),
     ("ramp", torch.arange(0.01, 20, 0.2)),
 ]
 
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
@@ -32,23 +32,23 @@
     ),
     (
         "op_mul_rank4_randn",
-        torch.randn(1, 10, 25, 20),
-        torch.randn(1, 10, 25, 20),
+        torch.randn(5, 10, 25, 20),
+        torch.randn(5, 10, 25, 20),
     ),
     (
         "op_mul_rank4_ones_mul_negative",
-        torch.ones(1, 10, 25, 20),
-        (-1) * torch.ones(1, 10, 25, 20),
+        torch.ones(5, 10, 25, 20),
+        (-1) * torch.ones(5, 10, 25, 20),
     ),
     (
         "op_mul_rank4_negative_large_rand",
-        (-200) * torch.rand(1, 10, 25, 20),
-        torch.rand(1, 1, 1, 20),
+        (-200) * torch.rand(5, 10, 25, 20),
+        torch.rand(5, 1, 1, 20),
     ),
     (
         "op_mul_rank4_large_randn",
-        200 * torch.randn(1, 10, 25, 20),
-        torch.rand(1, 10, 25, 1),
+        200 * torch.randn(5, 10, 25, 20),
+        torch.rand(5, 10, 25, 1),
     ),
 ]
 
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
@@ -145,20 +145,10 @@ def test_permute_u55_BI(
             self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,)
         )
 
-    @parameterized.expand(test_data_suite[:-2])
+    @parameterized.expand(test_data_suite)
     def test_permute_u85_BI(
         self, test_name: str, test_data: torch.Tensor, dims: list[int]
     ):
         self._test_permute_ethos_BI_pipeline(
             self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,)
         )
-
-    # Fails since on FVP since N > 1 is not supported. MLETORCH-517
-    @parameterized.expand(test_data_suite[-2:])
-    @conftest.expectedFailureOnFVP
-    def test_permute_u85_BI_xfails(
-        self, test_name: str, test_data: torch.Tensor, dims: list[int]
-    ):
-        self._test_permute_ethos_BI_pipeline(
-            self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,)
-        )
diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py
@@ -22,12 +22,12 @@
         torch.rand(5) * 5,
     ),
     ("op_reciprocal_rank1_negative_ones", torch.ones(5) * (-1)),
-    ("op_reciprocal_rank4_ones", torch.ones(1, 10, 25, 20)),
-    ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(1, 10, 25, 20)),
-    ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(1, 10, 25, 20)),
-    ("op_reciprocal_rank4_large_rand", 200 * torch.rand(1, 10, 25, 20)),
-    ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(1, 10, 25, 20)),
-    ("op_reciprocal_rank4_large_randn", 200 * torch.randn(1, 10, 25, 20) + 1),
+    ("op_reciprocal_rank4_ones", torch.ones(5, 10, 25, 20)),
+    ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(5, 10, 25, 20)),
+    ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(5, 10, 25, 20)),
+    ("op_reciprocal_rank4_large_rand", 200 * torch.rand(5, 10, 25, 20)),
+    ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(5, 10, 25, 20)),
+    ("op_reciprocal_rank4_large_randn", 200 * torch.randn(5, 10, 25, 20) + 1),
 ]
 
 

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`# (test_name, test_data, [num_features, affine, track_running_stats, weight, bias, running_mean, running_var,] )`
`23`	`23`	`(`
`24`	`24`	`"zeros_affineT_runStatsT_default_weight_bias_mean_var",`
`25`		`- torch.zeros(1, 32, 112, 112),`
	`25`	`+ torch.zeros(2, 32, 112, 112),`
`26`	`26`	`[`
`27`	`27`	`32,`
`28`	`28`	`True,`
Original file line number	Diff line number	Diff line change
`@@ -161,10 +161,10 @@`
`161`	`161`	`("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1),`
`162`	`162`	`("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1),`
`163`	`163`	`("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias),`
	`164`	`+ ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3),`
`164`	`165`	`]`
`165`	`166`
`166`	`167`	`testsuite_conv2d_u85_xfails = [`
`167`		`- ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3),`
`168`	`168`	`("two_dw_conv2d", two_dw_conv2d),`
`169`	`169`	`]`
`170`	`170`