Arm backend: Update VelaIO handling

oscarandersson8218 · oscarandersson8218 · commit df556e6be1dd · 2025-08-15T07:47:33.000+02:00
VelaIO is always 6D.
- Update AOT handling of metadata from Vela.
- Add unit tests to trigger 5D cases.
- Update EthosUBackend to read IO as 6D arrays.

Signed-off-by: Oscar Andersson <oscar.andersson@arm.com>
Change-Id: I8d7d3a44ac84e5bb14fa27e7b7765c3b7a8ee483
diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py
@@ -25,17 +25,19 @@
 # per-io structs to simplify runtime use.
 def vela_bin_pack_io(prefix, data):
     vela_input_shapes = data[prefix + "_shape"]
+    # Vela input/output shape is fixed to 6D
+    vela_io_shape_dims = 6
 
     ios = struct.pack("<i", len(vela_input_shapes))
     for i in range(len(vela_input_shapes)):
         io_shape = vela_input_shapes[i]
         io_elem_size = data[prefix + "_elem_size"][i]
         io_offset = data[prefix + "_offset"][i]
         io_region = data[prefix + "_region"][i]
-        assert len(io_shape) <= 4
-        inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
+        assert len(io_shape) == vela_io_shape_dims
+        inp_pad = io_shape.tolist()
         io_struct = struct.pack(
-            "<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
+            "<iiiiiiiii", *inp_pad, io_elem_size, io_offset, io_region
         )
         ios += io_struct
     return ios
diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp
@@ -279,12 +279,11 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
             event_tracer,
             "+EthosUBackend::execute()handles.input.permute_CHW_to_HWC()");
         // permuted byte copy CHW to HWC
+        int c, h, w;
+        ET_CHECK_OK_OR_RETURN_ERROR(get_chw(tensor_in, &c, &h, &w));
+
         permute_CHW_to_HWC(
-            tensor_in.mutable_data_ptr<char>(),
-            scratch_addr,
-            tensor_in.size(1),
-            tensor_in.size(2),
-            tensor_in.size(3));
+            tensor_in.mutable_data_ptr<char>(), scratch_addr, c, h, w);
       } else if (both_char || both_int || both_short || both_bool) {
         EXECUTORCH_PROF_SCOPE(
             event_tracer, "+EthosUBackend::execute()handles.input.memcpy()");
@@ -381,13 +380,11 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
             "+EthosUBackend::execute()handles.output.permute_HWC_to_CHW()");
 
         const char* output_address = static_cast<const char*>(output_addr);
+        int c, h, w;
+        ET_CHECK_OK_OR_RETURN_ERROR(get_chw(tensor_out, &c, &h, &w));
 
         permute_HWC_to_CHW(
-            output_address,
-            tensor_out.mutable_data_ptr<char>(),
-            tensor_out.size(1),
-            tensor_out.size(2),
-            tensor_out.size(3));
+            output_address, tensor_out.mutable_data_ptr<char>(), c, h, w);
       } else {
         EXECUTORCH_PROF_SCOPE(
             event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
@@ -421,8 +418,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       *tensor_count = *tensor_count * tensor.size(i);
     }
 
-    // The VelaIO type has a shape of fixed size 4
-    for (int i = 0; i < 4; i++) {
+    // The VelaIO type has a shape of fixed size 6
+    for (int i = 0; i < shapeDim; i++) {
       *io_count = *io_count * io->shape[i];
     }
   }
@@ -438,17 +435,46 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       // special case for NHWC workaround in AOT; as the compilation has
       // permuted to channel last in an undetectable way, we assume here
       // that the application has similarly permuted any input/output tensors.
-      permuted_shape = tensor.size(0) == io->shape[0] &&
-          tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] &&
-          tensor.size(3) == io->shape[2];
+      permuted_shape =
+          tensor.size(0) == io->shape[0] * io->shape[1] * io->shape[2] &&
+          tensor.size(1) == io->shape[5] && tensor.size(2) == io->shape[3] &&
+          tensor.size(3) == io->shape[4];
       if (permuted_shape) {
-        ET_LOG(Debug, "Tensor input/output %d will be permuted", index);
+        ET_LOG(Debug, "4D tensor input/output %d will be permuted", index);
+      }
+    } else if (tensor.dim() == 5) {
+      // tensor has format NNCHW, but the VelaIO is in NNNHWC
+      permuted_shape = io->shape[0] == 1 && tensor.size(0) == io->shape[1] &&
+          tensor.size(1) == io->shape[2] && tensor.size(2) == io->shape[5] &&
+          tensor.size(3) == io->shape[3] && tensor.size(4) == io->shape[4];
+      if (permuted_shape) {
+        ET_LOG(Debug, "5D tensor input/output %d will be permuted", index);
       }
     }
     *is_permuted = permuted_shape;
     return Error::Ok;
   }
 
+  Error get_chw(const executorch::aten::Tensor tensor, int* c, int* h, int* w)
+      const {
+    if (tensor.dim() == 4) {
+      *c = tensor.size(1);
+      *h = tensor.size(2);
+      *w = tensor.size(3);
+    } else if (tensor.dim() == 5) {
+      *c = tensor.size(2);
+      *h = tensor.size(3);
+      *w = tensor.size(4);
+    } else {
+      ET_LOG(
+          Error,
+          "Unsupported output tensor dimension %d, expected 4 or 5",
+          tensor.dim());
+      return Error::InvalidProgram;
+    }
+    return Error::Ok;
+  }
+
   void permute_CHW_to_HWC(const char* input, char* output, int C, int H, int W)
       const {
     for (int i = 0; i != H * W; ++i) {
diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 Arm Limited and/or its affiliates.
+ * Copyright 2023-2025 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -34,9 +34,11 @@ typedef struct {
   char data[]; // block.name specific format data
 } VelaBinBlock;
 
+constexpr int shapeDim = 6; // Number of dimensions in VelaIO
+
 // A Vela input or output descriptor in the binary stream
 typedef struct {
-  int shape[4]; // Up to 4D shape of input or output
+  int shape[shapeDim]; // Shape of input or output
   int elem_size; // Element sizeof in bytes
   int offset; // Offset in bytes within SRAM working data
   int region; // Scratch region this belongs to
diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py
@@ -29,6 +29,7 @@ class SqueezeDim(torch.nn.Module):
         "squeeze3d_dim_neg_2": lambda: (torch.randn(1, 1, 5), -2),
         "squeeze4d_dim_pos_3": lambda: (torch.randn(1, 2, 3, 1), 3),
         "squeeze4d_dim_neg_2": lambda: (torch.randn(1, 5, 1, 5), -2),
+        "squeeze5d_dim_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), -2),
     }
 
     def forward(self, x: torch.Tensor, dim: int):
@@ -40,6 +41,7 @@ class SqueezeDims(torch.nn.Module):
         "squeeze3d_dims_0_1": lambda: (torch.randn(1, 1, 5), (0, 1)),
         "squeeze4d_dims_0_neg_1": lambda: (torch.randn(1, 5, 5, 1), (0, -1)),
         "squeeze4d_dims_0_neg_2": lambda: (torch.randn(1, 5, 1, 5), (0, -2)),
+        "squeeze5d_dims_0_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), (0, -2)),
     }
 
     def forward(self, x: torch.Tensor, dims: tuple[int]):
@@ -51,6 +53,7 @@ class Squeeze(torch.nn.Module):
         "squeeze3d": lambda: (torch.randn(1, 1, 5),),
         "squeeze4d_dims": lambda: (torch.randn(1, 5, 5, 1),),
         "squeeze3d_dims_mix": lambda: (torch.randn(1, 5, 1, 5),),
+        "squeeze4d_dims_mix": lambda: (torch.randn(1, 1, 5, 1, 5),),
     }
 
     def forward(self, x: torch.Tensor):
diff --git a/backends/arm/test/ops/test_unflatten.py b/backends/arm/test/ops/test_unflatten.py
@@ -9,6 +9,8 @@
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
     TosaPipelineFP,
     TosaPipelineINT,
     VgfPipeline,
@@ -30,8 +32,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return torch.unflatten(x, self.dim, self.sizes)
 
     test_data: dict[str, test_data_t] = {
-        "randn_4d": (lambda: (Unflatten(1, (2, 2)), (torch.randn(3, 4, 5, 1),))),
-        "rand_3d": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(3, 4, 4),))),
+        "rand_3d_batch3": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(3, 4, 4),))),
+        "rand_3d_batch1": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(1, 4, 4),))),
+        "randn_4d_dim1": (lambda: (Unflatten(1, (2, 2)), (torch.randn(3, 4, 5, 1),))),
+        "randn_4d_dim3": (lambda: (Unflatten(3, (2, 2)), (torch.randn(1, 1, 5, 4),))),
     }
 
 
@@ -49,7 +53,33 @@ def test_unflatten_int_tosa_FP(test_data: test_data_t):
 @common.parametrize("test_data", Unflatten.test_data)
 def test_unflatten_int_tosa_INT(test_data: test_data_t):
     module, inputs = test_data()
-    pipeline = TosaPipelineINT[input_t](
+    pipeline = TosaPipelineINT[input_t](module, inputs, Unflatten.aten_op)
+    pipeline.run()
+
+
+xfails = {
+    "rand_3d_batch3": "Batch size > 1 currently not supported for FVP tests",
+    "randn_4d_dim1": "Batch size > 1 currently not supported for FVP tests",
+}
+
+
+@common.parametrize("test_data", Unflatten.test_data, xfails=xfails, strict=False)
+@common.XfailIfNoCorstone300
+def test_unflatten_int_u55_INT(test_data: test_data_t):
+    module, inputs = test_data()
+    pipeline = EthosU55PipelineINT[input_t](
+        module,
+        inputs,
+        Unflatten.aten_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Unflatten.test_data, xfails=xfails, strict=False)
+@common.XfailIfNoCorstone320
+def test_unflatten_int_u85_INT(test_data: test_data_t):
+    module, inputs = test_data()
+    pipeline = EthosU85PipelineINT[input_t](
         module,
         inputs,
         Unflatten.aten_op,
diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py
@@ -25,7 +25,7 @@
 
 
 class Unsqueeze(torch.nn.Module):
-    shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3)]
+    shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3), (1, 5, 4, 3)]
     test_parameters = {}
     for n in shapes:
         test_parameters[f"rand_{n}"] = (torch.randn(n),)
diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py
@@ -44,6 +44,10 @@ class View(torch.nn.Module):
         "rand_4d_4_3": lambda: (torch.rand(5, 10, 1, 1), (1, 25, 2)),
         "rand_4d_4_2": lambda: (torch.rand(2, 50, 1, 1), (1, 100)),
         "rand_4d_2_4_same": lambda: (torch.rand(2, 3, 2, 3), (2, 3, 3, 2)),
+        "rand_4d_5d": lambda: (torch.rand(1, 3, 4, 5), (1, 1, 4, 5, -1)),
+        "rand_5d_5d": lambda: (torch.rand(1, 1, 4, 5, 6), (1, 1, 4, -1, 6)),
+        "rand_5d_3d": lambda: (torch.rand(1, 1, 4, 5, 6), (2, 3, -1)),
+        "rand_3d_5d": lambda: (torch.rand(4, 5, 6), (1, 1, 2, -1, 3)),
     }
 
     rank_product_too_large = {
@@ -97,7 +101,9 @@ def test_view_tosa_INT(test_data: Tuple):
 }
 
 
-@common.parametrize("test_data", View.needs_transpose_tests, xfails=xfails)
+@common.parametrize(
+    "test_data", View.needs_transpose_tests, xfails=xfails, strict=False
+)
 @common.XfailIfNoCorstone300
 def test_view_u55_INT(test_data: Tuple):
     test_tensor, new_shape = test_data()
@@ -151,7 +157,9 @@ def test_view_u55_INT_not_delegated(test_data: Tuple):
     pipeline.run()
 
 
-@common.parametrize("test_data", View.needs_transpose_tests, xfails=xfails)
+@common.parametrize(
+    "test_data", View.needs_transpose_tests, xfails=xfails, strict=False
+)
 @common.XfailIfNoCorstone320
 def test_view_u85_INT(test_data: Tuple):
     test_tensor, new_shape = test_data()
diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh
@@ -60,7 +60,7 @@ fi
 
 # Vela
 vela_repo_url="https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela"
-vela_rev="d37febc1715edf0d236c2ff555739a8a9aadcf9a"
+vela_rev="9a43a1bf26bfc7588358d7e6e6bb2613b4981a34"
 
 # MLSDK dependencies
 mlsdk_manifest_dir="ml-sdk-for-vulkan-manifest"

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ class SqueezeDim(torch.nn.Module):`
`29`	`29`	`"squeeze3d_dim_neg_2": lambda: (torch.randn(1, 1, 5), -2),`
`30`	`30`	`"squeeze4d_dim_pos_3": lambda: (torch.randn(1, 2, 3, 1), 3),`
`31`	`31`	`"squeeze4d_dim_neg_2": lambda: (torch.randn(1, 5, 1, 5), -2),`
	`32`	`+ "squeeze5d_dim_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), -2),`
`32`	`33`	`}`
`33`	`34`
`34`	`35`	`def forward(self, x: torch.Tensor, dim: int):`
`@@ -40,6 +41,7 @@ class SqueezeDims(torch.nn.Module):`
`40`	`41`	`"squeeze3d_dims_0_1": lambda: (torch.randn(1, 1, 5), (0, 1)),`
`41`	`42`	`"squeeze4d_dims_0_neg_1": lambda: (torch.randn(1, 5, 5, 1), (0, -1)),`
`42`	`43`	`"squeeze4d_dims_0_neg_2": lambda: (torch.randn(1, 5, 1, 5), (0, -2)),`
	`44`	`+ "squeeze5d_dims_0_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), (0, -2)),`
`43`	`45`	`}`
`44`	`46`
`45`	`47`	`def forward(self, x: torch.Tensor, dims: tuple[int]):`
`@@ -51,6 +53,7 @@ class Squeeze(torch.nn.Module):`
`51`	`53`	`"squeeze3d": lambda: (torch.randn(1, 1, 5),),`
`52`	`54`	`"squeeze4d_dims": lambda: (torch.randn(1, 5, 5, 1),),`
`53`	`55`	`"squeeze3d_dims_mix": lambda: (torch.randn(1, 5, 1, 5),),`
	`56`	`+ "squeeze4d_dims_mix": lambda: (torch.randn(1, 1, 5, 1, 5),),`
`54`	`57`	`}`
`55`	`58`
`56`	`59`	`def forward(self, x: torch.Tensor):`