Skip to content

Commit 13fdca7

Browse files
committed
patch for runner
1 parent 9c9f665 commit 13fdca7

File tree

4 files changed

+351
-41
lines changed

4 files changed

+351
-41
lines changed

benchmark.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
import argparse
2+
import os
3+
import subprocess
4+
5+
# Location of the Qualcomm AI Engine Direct SDK, taken from the environment.
qnn_sdk = os.getenv("QNN_SDK_ROOT")
# Hexagon HTP architecture version of the target SoC (used in library names).
htp_arch = "79"
# Staging directory on the Android device where all artifacts are pushed.
workspace = "/data/local/tmp/et_ga_benchmark"
# Name of the generated on-device memory-sampling script.
memory_script_file = "peak_memory.sh"
# File that the runner and the memory script write their metrics into.
perf_file = "statistics.txt"
10+
11+
12+
def get_artifacts(backend, pte_path):
    """Collect the host-side files that must be pushed to the device.

    Args:
        backend: either "qnn" or "xnn".
        pte_path: host path to the generated .pte model file.

    Returns:
        List of host paths (model, runner binary, backend libraries and the
        generated memory-sampling script) to push to the device workspace.

    Side effect: writes the memory-sampling shell script to
    ``memory_script_file`` in the current working directory.
    """

    def get_build_dir(backend):
        # Build output folders produced by the corresponding build scripts.
        build_dir = {
            "qnn": "build-android",
            "xnn": "build-xnnpack",
        }
        return build_dir[backend]

    # On-device script: launches the runner ("$@") in the background, then
    # samples PSS + DMA-BUF usage of the process until it exits, appending
    # peak and average memory (in kB) to statistics.txt.
    #
    # Fix vs. previous revision: the peak comparison used bare words
    # (`[ CURRENT -gt PEAK_MEM ]`), which is not a numeric comparison of the
    # variables — `test` rejects the literal strings as integers, so PEAK_MEM
    # was never updated. Also skip a sample when dmabuf_dump yields nothing,
    # which would otherwise break the arithmetic expansion.
    memory_script = """$@ 2> /dev/null &

PROCESS=$1
PEAK_MEM=0
SAMPLES=0
TOTAL=0
while true; do
    PID=$(pidof $PROCESS)
    if [ "$PID" != "" ]; then
        DMA=$(dmabuf_dump $PID | grep "PROCESS TOTAL" | awk '{ print $3 }')
        PSS=$(dumpsys meminfo -s $PID | grep "TOTAL PSS" | awk '{ print $3 }')
        if [ "$PSS" == "" ] || [ "$DMA" == "" ]; then
            continue
        fi
        CURRENT=$(($DMA+$PSS))
        if [ "$CURRENT" -gt "$PEAK_MEM" ]; then
            PEAK_MEM=$CURRENT
        fi
        SAMPLES=$(($SAMPLES+1))
        TOTAL=$(($TOTAL+$CURRENT))
    else
        break
    fi
done

rm -rf memory_usage.txt
echo "peak_mem: $PEAK_MEM" >> statistics.txt
AVG_MEM=$(awk -- 'BEGIN{printf "%.3f", ARGV[1]/ARGV[2]}' "$TOTAL" "$SAMPLES")
echo "avg_mem: $AVG_MEM" >> statistics.txt
"""
    # NOTE(review): the `rm -rf memory_usage.txt` above looks like a leftover
    # from an earlier file name — nothing in this patch creates that file.
    with open(memory_script_file, "w") as f:
        f.write(memory_script)

    runner = {
        "qnn": f"{get_build_dir(backend)}/examples/qualcomm/executor_runner/qnn_executor_runner",
        "xnn": f"{get_build_dir(backend)}/backends/xnnpack/xnn_executor_runner",
    }
    artifacts = {
        # QNN needs the HTP libraries from the SDK alongside the backend .so.
        "qnn": [
            pte_path,
            f"{qnn_sdk}/lib/aarch64-android/libQnnHtp.so",
            (
                f"{qnn_sdk}/lib/hexagon-v{htp_arch}/"
                f"unsigned/libQnnHtpV{htp_arch}Skel.so"
            ),
            (f"{qnn_sdk}/lib/aarch64-android/" f"libQnnHtpV{htp_arch}Stub.so"),
            f"{qnn_sdk}/lib/aarch64-android/libQnnHtpPrepare.so",
            f"{qnn_sdk}/lib/aarch64-android/libQnnSystem.so",
            f"{get_build_dir(backend)}/backends/qualcomm/libqnn_executorch_backend.so",
            f"{qnn_sdk}/lib/aarch64-android/libQnnModelDlc.so",
            runner[backend],
            memory_script_file,
        ],
        # XNNPACK is statically linked into its runner.
        "xnn": [
            pte_path,
            runner[backend],
            memory_script_file,
        ],
    }
    return artifacts[backend]
81+
82+
def get_cmds(backend, pte_path, iteration):
    """Build the two shell commands executed on the device.

    Args:
        backend: either "qnn" or "xnn".
        pte_path: host path to the .pte file (only the basename is used on
            the device).
        iteration: number of inferences to run.

    Returns:
        ``[inference_cmd, memory_cmd]`` — the first times the inferences and
        dumps statistics, the second reruns the workload under the
        memory-sampling script.
    """
    # Per-backend runner binary and the flag it uses for the iteration count.
    binary = {"qnn": "qnn_executor_runner", "xnn": "xnn_executor_runner"}[backend]
    iter_flag = {"qnn": "--iteration", "xnn": "--num_executions"}[backend]
    model = os.path.basename(pte_path)
    shared_args = f"--model_path {model} {iter_flag} {iteration}"

    inference_cmd = (
        f"cd {workspace} && "
        f"chmod +x ./{binary} && "
        f"./{binary} {shared_args} --dump_statistics"
    )
    # The memory-profiling run wraps the runner in the sampling script and
    # drops --dump_statistics so inference metrics are not dumped twice.
    memory_cmd = (
        f"cd {workspace} && "
        f"chmod +x ./{binary} && "
        f"chmod +x {memory_script_file} && "
        f"./{memory_script_file} ./{binary} {shared_args}"
    )
    return [inference_cmd, memory_cmd]
145+
146+
147+
def start_benchmark(artifacts, cmds, device, host):
    """Push artifacts to the device, run the benchmark and print results.

    Args:
        artifacts: host paths to push into ``workspace`` on the device.
        cmds: shell command strings executed on the device, in order.
        device: adb serial number.
        host: optional adb gateway hostname (falsy to use the local server).
    """

    def adb(action):
        # Route through a remote adb server when a gateway host is given.
        if not host:
            actions = ["adb", "-s", device]
        else:
            actions = ["adb", "-H", host, "-s", device]
        actions.extend(action)
        subprocess.run(actions, stdout=subprocess.DEVNULL)

    def post_process():
        # Drop any stale local copy (os.remove instead of spawning `rm -rf`:
        # portable and no extra process), then pull and print the fresh one.
        if os.path.exists(perf_file):
            os.remove(perf_file)
        for file_name in [perf_file]:
            adb(["pull", f"{workspace}/{file_name}", "."])
            with open(file_name, "r") as f:
                print(f.read())

    # Start from a clean on-device workspace.
    adb(["shell", "rm", "-rf", workspace])
    adb(["shell", "mkdir", "-p", workspace])
    for artifact in artifacts:
        adb(["push", artifact, workspace])
    for cmd in cmds:
        adb(["shell", cmd])
    post_process()
170+
171+
172+
if __name__ == "__main__":
    # CLI driver: collect artifacts and commands for the chosen backend,
    # then run the benchmark on the selected adb device.
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--backend", required=True, help="either 'qnn' or 'xnn'")
    parser.add_argument("-p", "--pte", required=True, help="path to generated .pte file")
    parser.add_argument("-H", "--host", required=False, help="hostname for adb gateway")
    parser.add_argument(
        "-s", "--device", required=True, help="serial number for adb device"
    )
    parser.add_argument(
        "-i",
        "--iteration",
        required=False,
        default=100,
        help="total number of inferences",
    )
    args = parser.parse_args()
    start_benchmark(
        artifacts=get_artifacts(args.backend, args.pte),
        cmds=get_cmds(args.backend, args.pte, args.iteration),
        device=args.device,
        host=args.host,
    )

build_xnnpack.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash

# Cross-compile the ExecuTorch XNNPACK executor runner for Android arm64-v8a.
#
# Usage: build_xnnpack.sh [-c|--clean_build] [-d|--debug]
# Requires ANDROID_NDK_ROOT to point at an Android NDK installation.

if [[ -z $ANDROID_NDK_ROOT ]]; then
  echo "Please export ANDROID_NDK_ROOT=/path/to/ndk"
  exit 1
fi

CLEAN_BUILD="false"
BUILD_FOLDER="build-xnnpack"
BUILD_TYPE="release"

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Fix: do not shift inside the branches — the loop shifts once per
    # iteration below. The old double shift swallowed every second flag
    # (e.g. `-c -d` never enabled the debug build).
    -c|--clean_build) CLEAN_BUILD="true";;
    -d|--debug) BUILD_TYPE="Debug";;
    *) echo "unknown arg passed: $1"; exit 1;;
  esac
  shift
done

if [ "$CLEAN_BUILD" = true ]; then
  rm -rf $BUILD_FOLDER
fi

cmake \
  -DCMAKE_INSTALL_PREFIX=$BUILD_FOLDER \
  -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI='arm64-v8a' \
  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
  -DEXECUTORCH_BUILD_XNNPACK=ON \
  -DEXECUTORCH_ENABLE_LOGGING=ON \
  -DPYTHON_EXECUTABLE=python \
  -B$BUILD_FOLDER .

cmake --build $BUILD_FOLDER -j9 --target install --config $BUILD_TYPE

examples/portable/executor_runner/executor_runner.cpp

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
* all fp32 tensors.
1919
*/
2020

21+
#include <chrono>
22+
#include <fstream>
2123
#include <iostream>
2224
#include <memory>
2325

@@ -57,6 +59,7 @@ DEFINE_int32(
5759
cpu_threads,
5860
-1,
5961
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
62+
DEFINE_bool(dump_statistics, false, "Dump inference statistics.");
6063

6164
using executorch::extension::FileDataLoader;
6265
using executorch::runtime::Error;
@@ -241,65 +244,78 @@ int main(int argc, char** argv) {
241244
// be used by a single thread at at time, but it can be reused.
242245
//
243246
EventTraceManager tracer;
247+
auto before_load = std::chrono::high_resolution_clock::now();
244248
Result<Method> method = program->load_method(
245249
method_name, &memory_manager, tracer.get_event_tracer());
250+
auto after_load = std::chrono::high_resolution_clock::now();
251+
double interval_load =
252+
std::chrono::duration_cast<std::chrono::microseconds>(
253+
after_load - before_load)
254+
.count() /
255+
1000.0;
246256
ET_CHECK_MSG(
247257
method.ok(),
248258
"Loading of method %s failed with status 0x%" PRIx32,
249259
method_name,
250260
(uint32_t)method.error());
251261
ET_LOG(Info, "Method loaded.");
252262

253-
et_timestamp_t time_spent_executing = 0;
263+
et_timestamp_t time_spent_executing = 0, time_spent_executing_1st = 0;
264+
auto inputs = executorch::extension::prepare_input_tensors(*method);
265+
ET_LOG(Debug, "Preparing inputs.");
266+
// Allocate input tensors and set all of their elements to 1. The `inputs`
267+
// variable owns the allocated memory and must live past the last call to
268+
// `execute()`.
269+
//
270+
// NOTE: we have to re-prepare input tensors on every execution
271+
// because inputs whose space gets reused by memory planning (if
272+
// any such inputs exist) will not be preserved for the next
273+
// execution.
274+
275+
ET_CHECK_MSG(
276+
inputs.ok(),
277+
"Could not prepare inputs: 0x%" PRIx32,
278+
(uint32_t)inputs.error());
279+
ET_LOG(Debug, "Inputs prepared.");
280+
auto before_exec = std::chrono::high_resolution_clock::now();
281+
Error status = method->execute();
282+
auto after_exec = std::chrono::high_resolution_clock::now();
283+
double interval_1st_infs =
284+
std::chrono::duration_cast<std::chrono::microseconds>(
285+
after_exec - before_exec)
286+
.count() /
287+
1000.0;
288+
ET_CHECK_MSG(
289+
status == Error::Ok,
290+
"Execution of method %s failed with status 0x%" PRIx32,
291+
method_name,
292+
(uint32_t)status);
293+
254294
// Run the model.
295+
before_exec = std::chrono::high_resolution_clock::now();
255296
for (uint32_t i = 0; i < FLAGS_num_executions; i++) {
256-
ET_LOG(Debug, "Preparing inputs.");
257-
// Allocate input tensors and set all of their elements to 1. The `inputs`
258-
// variable owns the allocated memory and must live past the last call to
259-
// `execute()`.
260-
//
261-
// NOTE: we have to re-prepare input tensors on every execution
262-
// because inputs whose space gets reused by memory planning (if
263-
// any such inputs exist) will not be preserved for the next
264-
// execution.
265-
auto inputs = executorch::extension::prepare_input_tensors(*method);
266-
ET_CHECK_MSG(
267-
inputs.ok(),
268-
"Could not prepare inputs: 0x%" PRIx32,
269-
(uint32_t)inputs.error());
270-
ET_LOG(Debug, "Inputs prepared.");
271-
272-
const et_timestamp_t before_execute =
273-
executorch::runtime::pal_current_ticks();
274-
Error status = method->execute();
275-
const et_timestamp_t after_execute =
276-
executorch::runtime::pal_current_ticks();
277-
time_spent_executing += after_execute - before_execute;
297+
status = method->execute();
278298
ET_CHECK_MSG(
279299
status == Error::Ok,
280300
"Execution of method %s failed with status 0x%" PRIx32,
281301
method_name,
282302
(uint32_t)status);
283303
}
284-
const auto tick_ratio = et_pal_ticks_to_ns_multiplier();
285-
constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000;
286-
ET_LOG(
287-
Info,
288-
"Model executed successfully %" PRIu32 " time(s) in %f ms.",
289-
FLAGS_num_executions,
290-
static_cast<double>(time_spent_executing) * tick_ratio.numerator /
291-
tick_ratio.denominator / NANOSECONDS_PER_MILLISECOND);
292-
293-
// Print the outputs.
294-
std::vector<EValue> outputs(method->outputs_size());
295-
ET_LOG(Info, "%zu outputs: ", outputs.size());
296-
Error status = method->get_outputs(outputs.data(), outputs.size());
297-
ET_CHECK(status == Error::Ok);
298-
// Print the first and last 100 elements of long lists of scalars.
299-
std::cout << executorch::extension::evalue_edge_items(100);
300-
for (int i = 0; i < outputs.size(); ++i) {
301-
std::cout << "Output " << i << ": " << outputs[i] << std::endl;
304+
after_exec = std::chrono::high_resolution_clock::now();
305+
double interval_infs = std::chrono::duration_cast<std::chrono::microseconds>(
306+
after_exec - before_exec)
307+
.count() /
308+
1000.0 / FLAGS_num_executions;
309+
310+
if (FLAGS_dump_statistics) {
311+
auto output_file_name = "statistics.txt";
312+
std::ofstream fout(output_file_name);
313+
fout << "load: " + std::to_string(interval_load)
314+
<< "\n1st: " + std::to_string(interval_1st_infs)
315+
<< "\navg: " + std::to_string(interval_infs) << std::endl;
316+
fout.close();
302317
}
318+
ET_LOG(Info, "Model executed successfully.");
303319

304320
if (tracer.get_event_tracer()) {
305321
// Dump ETDump data containing profiling/debugging data to file specified in

0 commit comments

Comments
 (0)