Commit d9d2465
Update default executor runner with output options

By default, not all output is printed. This change adds an option to print all output and an option to write output to file. It also updates the Arm VKML unit test runner to write output to file, which enables the acos unit test to run on the Vulkan runtime.

Change-Id: If61c1fe89c9da004fa9db4524e1413893549abce
1 parent 58998b0 commit d9d2465
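
A rough usage sketch of the new runner options, mirroring how the Python test utilities in this commit build the command line. The paths below are hypothetical; the -model_path, -output_file, and -print_all_output flags are the ones used or added by this change.

# Sketch: invoke the updated executor_runner with the new output options.
# elf_path, pte_path, and the intermediates directory are hypothetical example paths.
import os
import subprocess

elf_path = "cmake-out/executor_runner"           # hypothetical build output
pte_path = "intermediates/model.pte"             # serialized program to run
out_path = os.path.join("intermediates", "out")  # base name; runner writes out-0.bin, out-1.bin, ...

cmd_line = f"{elf_path} -model_path {pte_path} -output_file {out_path} -print_all_output"
subprocess.run(cmd_line.split(), check=True)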

File tree

5 files changed: +126, -44 lines changed

backends/arm/test/ops/test_acos.py

Lines changed: 11 additions & 2 deletions

@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 from typing import Tuple
 
+import pytest
 import torch
 
 from executorch.backends.arm.test import common
@@ -102,8 +103,12 @@ def test_acos_vgf_FP(test_data: Tuple):
         [],
         [],
         tosa_version="TOSA-1.0+FP",
+        run_on_vulkan_runtime=True,
     )
-    pipeline.run()
+    try:
+        pipeline.run()
+    except FileNotFoundError as e:
+        pytest.skip(f"VKML executor_runner not found - not built - skip {e}")
 
 
 @common.parametrize("test_data", test_data_suite)
@@ -115,5 +120,9 @@ def test_acos_vgf_INT(test_data: Tuple):
         [],
         [],
         tosa_version="TOSA-1.0+INT",
+        run_on_vulkan_runtime=True,
     )
-    pipeline.run()
+    try:
+        pipeline.run()
+    except FileNotFoundError as e:
+        pytest.skip(f"VKML executor_runner not found - not built - skip {e}")

backends/arm/test/ops/test_add.py

Lines changed: 9 additions & 2 deletions

@@ -7,6 +7,7 @@
 
 from typing import Tuple
 
+import pytest
 import torch
 from executorch.backends.arm.quantizer import arm_quantizer
 from executorch.backends.arm.test import common, conftest
@@ -196,7 +197,10 @@ def test_add_tensor_vgf_FP(test_data: input_t1):
         tosa_version="TOSA-1.0+FP",
         run_on_vulkan_runtime=True,
     )
-    pipeline.run()
+    try:
+        pipeline.run()
+    except FileNotFoundError as e:
+        pytest.skip(f"VKML executor_runner not found - not built - skip {e}")
 
 
 @common.parametrize("test_data", Add.test_data)
@@ -210,4 +214,7 @@ def test_add_tensor_vgf_INT(test_data: input_t1):
         tosa_version="TOSA-1.0+INT",
         run_on_vulkan_runtime=True,
     )
-    pipeline.run()
+    try:
+        pipeline.run()
+    except FileNotFoundError as e:
+        pytest.skip(f"VKML executor_runner not found - not built - skip {e}")

backends/arm/test/runner_utils.py

Lines changed: 29 additions & 29 deletions

@@ -243,6 +243,25 @@ def save_inputs_to_file(
     return input_file_paths
 
 
+def get_output_from_file(
+    exported_program: ExportedProgram,
+    intermediate_path: str | Path,
+    output_base_name: str,
+):
+    output_np = []
+    output_node = exported_program.graph_module.graph.output_node()
+    for i, node in enumerate(output_node.args[0]):
+        output_shape = node.meta["val"].shape
+        output_dtype = node.meta["val"].dtype
+        tosa_ref_output = np.fromfile(
+            os.path.join(intermediate_path, f"{output_base_name}-{i}.bin"),
+            _torch_to_numpy_dtype_dict[output_dtype],
+        )
+
+        output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape))
+    return tuple(output_np)
+
+
 def run_vkml_emulation_layer(
     executorch_program_manager: ExecutorchProgramManager,
     inputs: Tuple[torch.Tensor],
@@ -267,10 +286,13 @@ def run_vkml_emulation_layer(
     with open(pte_path, "wb") as f:
         f.write(executorch_program_manager.buffer)
 
-    input_paths = save_inputs_to_file(exported_program, inputs, intermediate_path)
+    output_base_name = "out"
+    out_path = os.path.join(intermediate_path, output_base_name)
+
+    cmd_line = f"{elf_path} -model_path {pte_path} -output_file {out_path}"
 
-    cmd_line = f"{elf_path} -model_path {pte_path}"
     input_string = None
+    input_paths = save_inputs_to_file(exported_program, inputs, intermediate_path)
     for input_path in input_paths:
         if input_string is None:
             input_string = f" -inputs={input_path}"
@@ -282,23 +304,11 @@
 
     result = _run_cmd(cmd_line)
 
-    result_stdout = result.stdout.decode()  # noqa: F841
     # TODO: MLETORCH-1234: Support VGF e2e tests in VgfPipeline
     # TODO: Add regex to check for error or fault messages in stdout from Emulation Layer
-    # Regex to extract tensor values from stdout
-    output_np = []
-    matches = re.findall(
-        r"Output\s+\d+:\s+tensor\(sizes=\[(.*?)\],\s+\[(.*?)\]\)",
-        result_stdout,
-        re.DOTALL,
-    )
-
-    for shape_str, values_str in matches:
-        shape = list(map(int, shape_str.split(",")))
-        values = list(map(float, re.findall(r"[-+]?\d*\.\d+|\d+", values_str)))
-        output_np.append(torch.tensor(values).reshape(shape))
+    result_stdout = result.stdout.decode()  # noqa: F841
 
-    return tuple(output_np)
+    return get_output_from_file(exported_program, intermediate_path, output_base_name)
 
 
 def run_corstone(
@@ -342,7 +352,8 @@ def run_corstone(
 
     input_paths = save_inputs_to_file(exported_program, inputs, intermediate_path)
 
-    out_path = os.path.join(intermediate_path, "out")
+    output_base_name = "out"
+    out_path = os.path.join(intermediate_path, output_base_name)
 
     cmd_line = f"executor_runner -m {pte_path} -o {out_path}"
     for input_path in input_paths:
@@ -424,18 +435,7 @@ def run_corstone(
            f"Corstone simulation failed:\ncmd: {' '.join(command_args)}\nlog: \n {result_stdout}\n{result.stderr.decode()}"
        )
 
-    output_np = []
-    output_node = exported_program.graph_module.graph.output_node()
-    for i, node in enumerate(output_node.args[0]):
-        output_shape = node.meta["val"].shape
-        output_dtype = node.meta["val"].dtype
-        tosa_ref_output = np.fromfile(
-            os.path.join(intermediate_path, f"out-{i}.bin"),
-            _torch_to_numpy_dtype_dict[output_dtype],
-        )
-
-        output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape))
-    return tuple(output_np)
+    return get_output_from_file(exported_program, intermediate_path, output_base_name)
 
 
 def prep_data_for_save(
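
For reference, a minimal sketch of the on-disk contract that get_output_from_file relies on: both the Corstone runner and the VKML runner name each output tensor file <output_base_name>-<i>.bin, containing raw tensor bytes. The dtype and shape are normally taken from the output node's meta["val"]; float32 and (1, 10) below are illustrative assumptions.

# Sketch: read back one raw output file written via the runner's -output_file option.
import numpy as np
import torch

raw = np.fromfile("intermediates/out-0.bin", dtype=np.float32)  # dtype would come from the exported graph
output = torch.from_numpy(raw).reshape(1, 10)                   # shape would come from the exported graph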

examples/portable/executor_runner/executor_runner.cpp

Lines changed: 70 additions & 4 deletions

@@ -51,6 +51,15 @@ DEFINE_string(
     "model.pte",
     "Model serialized in flatbuffer format.");
 DEFINE_string(inputs, "", "Comma-separated list of input files");
+DEFINE_string(
+    output_file,
+    "",
+    "Base name of output file. If not empty output will be written to the file(s).");
+
+DEFINE_bool(
+    print_all_output,
+    false,
+    "Prints all output. By default only first and last 100 elements are printed.");
 DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
@@ -328,10 +337,67 @@ int main(int argc, char** argv) {
   ET_LOG(Info, "%zu outputs: ", outputs.size());
   Error status = method->get_outputs(outputs.data(), outputs.size());
   ET_CHECK(status == Error::Ok);
-  // Print the first and last 100 elements of long lists of scalars.
-  std::cout << executorch::extension::evalue_edge_items(100);
-  for (int i = 0; i < outputs.size(); ++i) {
-    std::cout << "Output " << i << ": " << outputs[i] << std::endl;
+
+  if (FLAGS_output_file.size() > 0) {
+    for (int i = 0; i < outputs.size(); ++i) {
+      if (outputs[i].isTensor()) {
+        Tensor tensor = outputs[i].toTensor();
+
+        char out_filename[255];
+        snprintf(out_filename, 255, "%s-%d.bin", FLAGS_output_file.c_str(), i);
+        ET_LOG(Info, "Writing output to file: %s", out_filename);
+        FILE* out_file = fopen(out_filename, "wb");
+        auto written_size =
+            fwrite(tensor.const_data_ptr<char>(), 1, tensor.nbytes(), out_file);
+        fclose(out_file);
+      }
+    }
+  }
+
+  if (FLAGS_print_all_output) {
+    for (int i = 0; i < outputs.size(); ++i) {
+      if (outputs[i].isTensor()) {
+        Tensor tensor = outputs[i].toTensor();
+
+        for (int j = 0; j < tensor.numel(); ++j) {
+          if (tensor.scalar_type() == ScalarType::Int) {
+            printf(
+                "Output[%d][%d]: (int) %d\n",
+                i,
+                j,
+                tensor.const_data_ptr<int>()[j]);
+          } else if (tensor.scalar_type() == ScalarType::Float) {
+            printf(
+                "Output[%d][%d]: (float) %f\n",
+                i,
+                j,
+                tensor.const_data_ptr<float>()[j]);
+          } else if (tensor.scalar_type() == ScalarType::Char) {
+            printf(
+                "Output[%d][%d]: (char) %d\n",
+                i,
+                j,
+                tensor.const_data_ptr<int8_t>()[j]);
+          } else if (tensor.scalar_type() == ScalarType::Bool) {
+            printf(
+                "Output[%d][%d]: (bool) %s (0x%x)\n",
+                i,
+                j,
+                tensor.const_data_ptr<int8_t>()[j] ? "true " : "false",
+                tensor.const_data_ptr<int8_t>()[j]);
+          }
+        }
+      } else {
+        printf("Output[%d]: Not Tensor\n", i);
+      }
+    }
+  } else {
+    // Print the first and last 100 elements of long lists of scalars.
+    std::cout << executorch::extension::evalue_edge_items(100);
+
+    for (int i = 0; i < outputs.size(); ++i) {
+      std::cout << "OutputX " << i << ": " << outputs[i] << std::endl;
+    }
   }
 
   if (tracer.get_event_tracer()) {
extension/runner_util/inputs.cpp

Lines changed: 7 additions & 7 deletions

@@ -31,15 +31,15 @@ Result<BufferCleanup> prepare_input_tensors(
   size_t num_inputs = method_meta.num_inputs();
   bool hard_code_inputs_to_ones = true;
 
-  ET_CHECK_OR_RETURN_ERROR(
-      input_buffers.size() > 0 && num_inputs == input_buffers.size(),
-      InvalidArgument,
-      "Wrong number of inputs allocated compared to method %zu ? %zu",
-      num_inputs,
-      input_buffers.size());
-
   if (input_buffers.size() > 0) {
     hard_code_inputs_to_ones = false;
+
+    ET_CHECK_OR_RETURN_ERROR(
+        num_inputs == input_buffers.size(),
+        InvalidArgument,
+        "Wrong number of inputs allocated compared to method %zu ? %zu",
+        num_inputs,
+        input_buffers.size());
   }
 
   // A large number of small allocations could exhaust the heap even if the
