From 1df6f972dc23d80fa94865b13bb8ef6b4762464a Mon Sep 17 00:00:00 2001 From: Benjamin Klimczak Date: Thu, 27 Jun 2024 15:52:32 +0100 Subject: [PATCH 1/2] Add event tracing and ETDumps to executor_runner - Enabled via EXECUTORCH_ENABLE_EVENT_TRACER - Add flag 'etdump_path' to specify the file path for the ETDump file - Add flag 'num_executions' for number of iterations to run - Create and pass event tracer 'ETDumpGen' - Save ETDump to disk - Update docs to reflect the changes Signed-off-by: Benjamin Klimczak Change-Id: I876d5138455d1b04fba9af4016d8341e8866f9c0 --- CMakeLists.txt | 5 ++ backends/xnnpack/CMakeLists.txt | 8 ++- ...e-delegates-executorch-xnnpack-delegate.md | 2 +- .../tutorial-xnnpack-delegate-lowering.md | 3 + .../executor_runner/executor_runner.cpp | 60 +++++++++++++++---- 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4a3322c24d..c5b9ae35ce8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -581,6 +582,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) list(APPEND _executor_runner_libs portable_ops_lib) endif() + if(EXECUTORCH_ENABLE_EVENT_TRACER) + list(APPEND _executor_runner_libs etdump flatccrt) + endif() + # Generate lib to register quantized ops if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) list(APPEND _executor_runner_libs quantized_ops_lib) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 1ac7867f3c0..90c23c5aa07 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -1,4 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -113,8 +114,13 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") # list(TRANSFORM _xnn_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") add_executable(xnn_executor_runner ${_xnn_executor_runner__srcs}) + + if(EXECUTORCH_ENABLE_EVENT_TRACER) + list(APPEND xnn_executor_runner_libs etdump) + endif() + target_link_libraries( - xnn_executor_runner xnnpack_backend gflags portable_ops_lib + xnn_executor_runner gflags portable_ops_lib ${xnn_executor_runner_libs} ) target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options}) endif() diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 1d12daef9d8..42d4eeb3b10 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. 
You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `xnn_executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c80..58fcf474ba5 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -171,3 +171,6 @@ Now you should be able to find the executable built at `./cmake-out/backends/xnn ## Building and Linking with the XNNPACK Backend You can build the XNNPACK backend [CMake target](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt#L83), and link it with your application binary such as an Android or iOS application. For more information on this you may take a look at this [resource](demo-apps-android.md) next. + +## Profiling +To enable profiling in the `xnn_executor_runner` pass the flags `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` and `-DEXECUTORCH_BUILD_SDK=ON` to the build command. This will enable ETDump generation when running the inference and enables command line flags for profiling (see `xnn_executor_runner --help` for details). diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index a0644487d23..5271d204d7c 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. + * Copyright 2024 Arm Limited and/or its affiliates. * All rights reserved. 
 * * This source code is licensed under the BSD-style license found in the @@ -29,6 +30,9 @@ #include #include #include +#ifdef ET_EVENT_TRACER_ENABLED +#include <executorch/sdk/etdump/etdump_flatcc.h> +#endif // ET_EVENT_TRACER_ENABLED static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB @@ -36,6 +40,13 @@ DEFINE_string( model_path, "model.pte", "Model serialized in flatbuffer format."); +#ifdef ET_EVENT_TRACER_ENABLED +DEFINE_string( + etdump_path, + "model.etdump", + "If ETDump generation is enabled an ETDump will be written out to this path."); +DEFINE_uint32(num_executions, 10, "Number of times to run the model."); +#endif // ET_EVENT_TRACER_ENABLED using namespace torch::executor; using torch::executor::util::FileDataLoader; @@ -142,8 +153,22 @@ int main(int argc, char** argv) { // the method can mutate the memory-planned buffers, so the method should only // be used by a single thread at at time, but it can be reused. // - - Result<Method> method = program->load_method(method_name, &memory_manager); + uint32_t num_executions = 1; + EventTracer* event_tracer_ptr = nullptr; +#ifdef ET_EVENT_TRACER_ENABLED + std::unique_ptr<FILE, decltype(&fclose)> etdump_file( + fopen(FLAGS_etdump_path.c_str(), "w+"), fclose); + ET_CHECK_MSG( + etdump_file, + "Failed to open ETDump file at %s.", + FLAGS_etdump_path.c_str()); + + num_executions = FLAGS_num_executions; + torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + event_tracer_ptr = &etdump_gen; +#endif // ET_EVENT_TRACER_ENABLED + Result<Method> method = + program->load_method(method_name, &memory_manager, event_tracer_ptr); ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, @@ -162,18 +187,20 @@ int main(int argc, char** argv) { ET_LOG(Info, "Inputs prepared."); // Run the model. 
- Error status = method->execute(); - ET_CHECK_MSG( - status == Error::Ok, - "Execution of method %s failed with status 0x%" PRIx32, - method_name, - (uint32_t)status); - ET_LOG(Info, "Model executed successfully."); + for (uint32_t i = 0; i < num_executions; i++) { + Error status = method->execute(); + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + method_name, + (uint32_t)status); + } + ET_LOG(Info, "Model executed successfully %i time(s).", num_executions); // Print the outputs. std::vector<EValue> outputs(method->outputs_size()); ET_LOG(Info, "%zu outputs: ", outputs.size()); - status = method->get_outputs(outputs.data(), outputs.size()); + Error status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. std::cout << torch::executor::util::evalue_edge_items(100); @@ -181,5 +208,18 @@ int main(int argc, char** argv) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } +#ifdef ET_EVENT_TRACER_ENABLED + // Dump the ETDump data containing profiling/debugging data to the specified + // file. 
+ etdump_result result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + fwrite((uint8_t*)result.buf, 1, result.size, etdump_file.get()); + free(result.buf); + ET_LOG(Info, "ETDump written to file '%s'.", FLAGS_etdump_path.c_str()); + } else { + ET_LOG(Error, "No ETDump data available!"); + } +#endif // ET_EVENT_TRACER_ENABLED + return 0; } From 50b711e51ff7a2ded7d8aaa1bbcd6a0a025743d0 Mon Sep 17 00:00:00 2001 From: Benjamin Klimczak Date: Wed, 14 Aug 2024 12:37:59 +0100 Subject: [PATCH 2/2] Minor changes to event tracing in executor_runner - Make flag 'num_executions' available in the executor_runner irrespective of the event tracing - Update docs to explain usage of 'ENABLE_XNNPACK_PROFILING' for additional profiling info Signed-off-by: Benjamin Klimczak Change-Id: I35abbd2d913880cb129bddb80514992f4dd84004 --- .../native-delegates-executorch-xnnpack-delegate.md | 2 +- docs/source/tutorial-xnnpack-delegate-lowering.md | 2 +- examples/portable/executor_runner/executor_runner.cpp | 8 +++----- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 42d4eeb3b10..8d9ec45c620 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. 
You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `xnn_executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). +We have enabled basic profiling for the XNNPACK delegate that can be enabled with the compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER` (add `-DENABLE_XNNPACK_PROFILING` for additional details). With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `xnn_executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 58fcf474ba5..e97de48d3b0 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -173,4 +173,4 @@ Now you should be able to find the executable built at `./cmake-out/backends/xnn You can build the XNNPACK backend [CMake target](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt#L83), and link it with your application binary such as an Android or iOS application. For more information on this you may take a look at this [resource](demo-apps-android.md) next. ## Profiling -To enable profiling in the `xnn_executor_runner` pass the flags `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` and `-DEXECUTORCH_BUILD_SDK=ON` to the build command. 
This will enable ETDump generation when running the inference and enables command line flags for profiling (see `xnn_executor_runner --help` for details). +To enable profiling in the `xnn_executor_runner` pass the flags `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` and `-DEXECUTORCH_BUILD_SDK=ON` to the build command (add `-DENABLE_XNNPACK_PROFILING=ON` for additional details). This will enable ETDump generation when running the inference and enables command line flags for profiling (see `xnn_executor_runner --help` for details). diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 5271d204d7c..132f85feb1f 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -40,12 +40,12 @@ DEFINE_string( model_path, "model.pte", "Model serialized in flatbuffer format."); +DEFINE_uint32(num_executions, 1, "Number of times to run the model."); #ifdef ET_EVENT_TRACER_ENABLED DEFINE_string( etdump_path, "model.etdump", "If ETDump generation is enabled an ETDump will be written out to this path."); -DEFINE_uint32(num_executions, 10, "Number of times to run the model."); #endif // ET_EVENT_TRACER_ENABLED using namespace torch::executor; @@ -153,7 +153,6 @@ int main(int argc, char** argv) { // the method can mutate the memory-planned buffers, so the method should only // be used by a single thread at at time, but it can be reused. 
// - uint32_t num_executions = 1; EventTracer* event_tracer_ptr = nullptr; #ifdef ET_EVENT_TRACER_ENABLED std::unique_ptr<FILE, decltype(&fclose)> etdump_file( @@ -163,7 +162,6 @@ int main(int argc, char** argv) { "Failed to open ETDump file at %s.", FLAGS_etdump_path.c_str()); - num_executions = FLAGS_num_executions; torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); event_tracer_ptr = &etdump_gen; #endif // ET_EVENT_TRACER_ENABLED @@ -187,7 +185,7 @@ int main(int argc, char** argv) { ET_LOG(Info, "Inputs prepared."); // Run the model. - for (uint32_t i = 0; i < num_executions; i++) { + for (uint32_t i = 0; i < FLAGS_num_executions; i++) { Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, @@ -195,7 +193,7 @@ int main(int argc, char** argv) { method_name, (uint32_t)status); } - ET_LOG(Info, "Model executed successfully %i time(s).", num_executions); + ET_LOG(Info, "Model executed successfully %i time(s).", FLAGS_num_executions); // Print the outputs. std::vector<EValue> outputs(method->outputs_size());