From a863305fa483cddbe12299186e502ca6c3eba8cf Mon Sep 17 00:00:00 2001
From: Saoirse Stewart <saoirse.stewart@arm.com>
Date: Tue, 12 Nov 2024 15:26:34 +0000
Subject: [PATCH] Add aot_arm_compiler flag to allow the reordering of the
 inputs

* Add capability to use the input order given on the command line in the backend
* Extend the test infrastructure to handle this
---
 backends/arm/arm_backend.py      | 31 +++++++++++++++++++++++++++++--
 backends/arm/arm_vela.py         | 15 +++++++++------
 backends/arm/test/common.py      | 32 ++++++++++++++++++++++++++------
 examples/arm/aot_arm_compiler.py | 18 ++++++++++++++++--
 examples/arm/run.sh              |  6 +++++-
 5 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 06207611e09..ad2d1e73afb 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -52,6 +52,7 @@ def __init__(self):
         self.permute_nhwc = False
         self.quantize_io = False
         self.tosa_version = None
+        self.input_order = None
 
     def ethosu_compile_spec(
         self,
@@ -134,6 +135,14 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
         self.quantize_io = quantize_io
         return self
 
+    def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder":
+        """
+        Set the order of the inputs as comma-separated indices, e.g. "1,0". This may
+        be required when inputs > 1; it is only passed on to the Vela compile step.
+        """
+        self.input_order = input_order
+        return self
+
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
@@ -163,6 +172,13 @@ def build(self) -> List[CompileSpec]:
                 CompileSpec("permute_memory_format", "nhwc".encode())
             )
 
+        if self.input_order:
+            # preprocess() splits this value on ","; accept a "1,0" string or ints.
+            order = self.input_order
+            if not isinstance(order, str):
+                order = ",".join(map(str, order))
+            self.compile_spec.append(CompileSpec("input_order", order.encode()))
+
         if self.quantize_io:
             self.compile_spec.append(CompileSpec("quantize_io", "True".encode()))
 
@@ -214,6 +230,7 @@ def preprocess(  # noqa: C901
         artifact_path = None
         output_format = ""
         compile_flags = []
+        input_order = []
         for spec in compile_spec:
             if spec.key == "debug_artifact_path":
                 artifact_path = spec.value.decode()
@@ -221,6 +238,8 @@ def preprocess(  # noqa: C901
                 output_format = spec.value.decode()
             if spec.key == "compile_flags":
                 compile_flags.append(spec.value.decode())
+            if spec.key == "input_order":
+                input_order = list(map(int, spec.value.decode().split(",")))
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -246,12 +265,14 @@ def preprocess(  # noqa: C901
         )
 
         node_visitors = get_node_visitors(edge_program, tosa_spec)
-
+        input_count = 0
         for node in graph_module.graph.nodes:
             if node.op == "call_function":
                 process_call_function(node, tosa_graph, node_visitors, tosa_spec)
             elif node.op == "placeholder":
                 process_placeholder(node, tosa_graph, edge_program, tosa_spec)
+                if node.name in edge_program.graph_signature.user_inputs:
+                    input_count += 1
             elif node.op == "output":
                 process_output(node, tosa_graph)
             else:
@@ -259,6 +280,12 @@ def preprocess(  # noqa: C901
                 # any checking of compatibility.
                 dbg_fail(node, tosa_graph, artifact_path)
 
+        # input_order indexes the inputs, so it must name each one exactly once.
+        if input_order and sorted(input_order) != list(range(input_count)):
+            raise RuntimeError(
+                "input_order must be a permutation of the input tensor indices 0..N-1"
+            )
+
         # TODO: It would be awesome if this dump could somehow be done on top level and not here.
         # Problem is that the desc.json has to be created on the tosa_graph object, which we can't
         # access from top level.
@@ -275,7 +302,7 @@ def preprocess(  # noqa: C901
         # preprocess and some consume TOSA fb directly.
         if output_format == "vela":
             # Emit vela_bin_stream format
-            binary = vela_compile(tosa_graph, compile_flags)
+            binary = vela_compile(tosa_graph, compile_flags, input_order)
         elif output_format == "tosa":
             # Emit TOSA flatbuffer
             binary = bytes(tosa_graph.serialize())
diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py
index 01bb8bd55e5..918d95ba379 100644
--- a/backends/arm/arm_vela.py
+++ b/backends/arm/arm_vela.py
@@ -17,10 +17,13 @@
 
 # Pack either input or output tensor block, compose the related arrays into
 # per-io structs to simplify runtime use.
-def vela_bin_pack_io(prefix, data):
-    ios = struct.pack("<i", len(data[prefix + "_shape"]))
-    for i in range(len(data[prefix + "_shape"])):
-        io_shape = data[prefix + "_shape"][i]
+def vela_bin_pack_io(prefix, data, shape_order=None):
+    vela_input_shapes = data[prefix + "_shape"]
+
+    order = shape_order if shape_order else range(len(vela_input_shapes))
+    ios = struct.pack("<i", len(vela_input_shapes))
+    for i in order:
+        io_shape = vela_input_shapes[i]
         io_elem_size = data[prefix + "_elem_size"][i]
         io_offset = data[prefix + "_offset"][i]
         io_region = data[prefix + "_region"][i]
@@ -36,7 +39,7 @@ def vela_bin_pack_io(prefix, data):
 # Output via Vela to binary stream for ArmBackendEthosU
 # WARNING: Do not change this without changing VelaBinStream.cpp as that
 #          function consumes this format and the two need to align.
-def vela_compile(tosa_graph, args: List[str]):
+def vela_compile(tosa_graph, args: List[str], shape_order=None):
     with tempfile.TemporaryDirectory() as tmpdir:
         tosaname = "out.tosa"
         flatbuffer = tosa_graph.serialize()
@@ -78,7 +81,7 @@ def vela_compile(tosa_graph, args: List[str]):
             bin_blocks["scratch_data"] = b"\x00" * block_length
 
             # Capture inputs and outputs
-            bin_blocks["inputs"] = vela_bin_pack_io("input", data)
+            bin_blocks["inputs"] = vela_bin_pack_io("input", data, shape_order)
             bin_blocks["outputs"] = vela_bin_pack_io("output", data)
 
             bin_blocks["vela_end_stream"] = b""
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index 3a9818929b9..4b86663e6bb 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -213,29 +213,44 @@ def get_tosa_compile_spec_unbuilt(
 
 
 def get_u55_compile_spec(
-    permute_memory_to_nhwc=True, quantize_io=False, custom_path=None
+    permute_memory_to_nhwc=True,
+    quantize_io=False,
+    custom_path=None,
+    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """
     Default compile spec for Ethos-U55 tests.
     """
     return get_u55_compile_spec_unbuilt(
-        permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path
+        permute_memory_to_nhwc,
+        quantize_io=quantize_io,
+        custom_path=custom_path,
+        reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u85_compile_spec(
-    permute_memory_to_nhwc=True, quantize_io=False, custom_path=None
+    permute_memory_to_nhwc=True,
+    quantize_io=False,
+    custom_path=None,
+    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """
     Default compile spec for Ethos-U85 tests.
     """
     return get_u85_compile_spec_unbuilt(
-        permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path
+        permute_memory_to_nhwc,
+        quantize_io=quantize_io,
+        custom_path=custom_path,
+        reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u55_compile_spec_unbuilt(
-    permute_memory_to_nhwc=True, quantize_io=False, custom_path=None
+    permute_memory_to_nhwc=True,
+    quantize_io=False,
+    custom_path=None,
+    reorder_inputs=None,
 ) -> ArmCompileSpecBuilder:
     """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -254,12 +269,16 @@ def get_u55_compile_spec_unbuilt(
         .set_quantize_io(is_option_enabled("quantize_io") or quantize_io)
         .set_permute_memory_format(permute_memory_to_nhwc)
         .dump_intermediate_artifacts_to(artifact_path)
+        .set_input_order(reorder_inputs)
     )
     return compile_spec
 
 
 def get_u85_compile_spec_unbuilt(
-    permute_memory_to_nhwc=True, quantize_io=False, custom_path=None
+    permute_memory_to_nhwc=True,
+    quantize_io=False,
+    custom_path=None,
+    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -276,6 +295,7 @@ def get_u85_compile_spec_unbuilt(
         .set_quantize_io(is_option_enabled("quantize_io") or quantize_io)
         .set_permute_memory_format(permute_memory_to_nhwc)
         .dump_intermediate_artifacts_to(artifact_path)
+        .set_input_order(reorder_inputs)
     )
     return compile_spec
 
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 4953f8735e3..ddd5fd6b0bb 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -245,7 +245,9 @@ def get_calibration_data(
 
 
 def get_compile_spec(
-    target: str, intermediates: Optional[str] = None
+    target: str,
+    intermediates: Optional[str] = None,
+    reorder_inputs: Optional[str] = None,
 ) -> ArmCompileSpecBuilder:
     spec_builder = None
     if target == "TOSA":
@@ -265,6 +267,7 @@ def get_compile_spec(
             )
             .set_permute_memory_format(True)
             .set_quantize_io(True)
+            .set_input_order(reorder_inputs)
         )
     elif "ethos-u85" in target:
         spec_builder = (
@@ -277,6 +280,7 @@ def get_compile_spec(
             )
             .set_permute_memory_format(True)
             .set_quantize_io(True)
+            .set_input_order(reorder_inputs)
         )
 
     if intermediates is not None:
@@ -419,6 +423,14 @@ def get_args():
         required=False,
         help="Location for outputs, if not the default of cwd.",
     )
+    parser.add_argument(
+        "-r",
+        "--reorder_inputs",
+        type=str,
+        required=False,
+        default=None,
+        help="Provide the order of the inputs. This can be required when inputs > 1.",
+    )
     args = parser.parse_args()
 
     if args.evaluate and (
@@ -481,7 +493,9 @@ def get_args():
     if args.delegate:
         # As we can target multiple output encodings from ArmBackend, one must
         # be specified.
-        compile_spec = get_compile_spec(args.target, args.intermediates)
+        compile_spec = get_compile_spec(
+            args.target, args.intermediates, args.reorder_inputs
+        )
         edge = to_edge_transform_and_lower(
             exported_program,
             partitioner=[ArmPartitioner(compile_spec)],
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index e4941519449..6de8ec2d2b7 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -20,6 +20,7 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 root_dir=${script_dir}/ethos-u-scratch
 
 model_name=""
+reorder_inputs=""
 aot_arm_compiler_flags="--delegate --quantize"
 target="ethos-u55-128"
 output_folder_set=false
@@ -37,6 +38,7 @@ help() {
     echo "  --output=<FOLDER>                      Output folder Default: ${output_folder}"
     echo "  --build_only                           Only build, don't run FVP"
     echo "  --scratch-dir=<FOLDER>                 Path to your Ethos-U scrach dir if you not using default"
+    echo "  --reorder_inputs=<FLAGS>               Reorder the inputs. This can be required when inputs > 1."
     exit 0
 }
 
@@ -50,6 +52,7 @@ for arg in "$@"; do
       --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;;
       --build_only) build_only=true ;;
       --scratch-dir=*) root_dir="${arg#*=}";;
+      --reorder_inputs=*) reorder_inputs="${arg#*=}";;
       *)
       ;;
     esac
@@ -112,7 +115,7 @@ function generate_pte_file() {
     # We are using the aot_lib from build_quantization_aot_lib below
     SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT})
 
-    python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library="$SO_LIB" 1>&2
+    python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags} --reorder_inputs="${reorder_inputs}" --output ${output_folder} --so_library="$SO_LIB" 1>&2
     [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
@@ -287,6 +290,7 @@ if [[ -z "$model_name" ]]; then
 else
     test_model=( "$model_name" )
     model_compiler_flags=( "$aot_arm_compiler_flags" )
+    reorder_inputs=( "$reorder_inputs" )
 fi
 
 # loop over running the AoT flow and executing the model on device