Skip to content

Commit 13fdca7

Browse files
committed
patch for runner
1 parent 9c9f665 commit 13fdca7

File tree

4 files changed

+351
-41
lines changed

4 files changed

+351
-41
lines changed

benchmark.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
import argparse
2+
import os
3+
import subprocess
4+
5+
# Location of the Qualcomm AI Engine Direct SDK, taken from the environment.
qnn_sdk = os.getenv("QNN_SDK_ROOT")
# Hexagon HTP architecture version of the target SoC (used in library names).
htp_arch = "79"
# Staging directory on the Android device where all artifacts are pushed.
workspace = "/data/local/tmp/et_ga_benchmark"
# Name of the generated on-device memory-sampling script.
memory_script_file = "peak_memory.sh"
# File that the runner and the memory script write their metrics into.
perf_file = "statistics.txt"
10+
11+
12+
def get_artifacts(backend, pte_path):
    """Collect the host-side files that must be pushed to the device.

    Args:
        backend: either "qnn" or "xnn".
        pte_path: host path to the generated .pte model file.

    Returns:
        List of host paths (model, runner binary, backend libraries and the
        generated memory-sampling script) to push to the device workspace.

    Side effect: writes the memory-sampling shell script to
    ``memory_script_file`` in the current working directory.
    """

    def get_build_dir(backend):
        # Build output folders produced by the corresponding build scripts.
        build_dir = {
            "qnn": "build-android",
            "xnn": "build-xnnpack",
        }
        return build_dir[backend]

    # On-device script: launches the runner ("$@") in the background, then
    # samples PSS + DMA-BUF usage of the process until it exits, appending
    # peak and average memory (in kB) to statistics.txt.
    #
    # Fix vs. previous revision: the peak comparison used bare words
    # (`[ CURRENT -gt PEAK_MEM ]`), which is not a numeric comparison of the
    # variables — `test` rejects the literal strings as integers, so PEAK_MEM
    # was never updated. Also skip a sample when dmabuf_dump yields nothing,
    # which would otherwise break the arithmetic expansion.
    memory_script = """$@ 2> /dev/null &

PROCESS=$1
PEAK_MEM=0
SAMPLES=0
TOTAL=0
while true; do
    PID=$(pidof $PROCESS)
    if [ "$PID" != "" ]; then
        DMA=$(dmabuf_dump $PID | grep "PROCESS TOTAL" | awk '{ print $3 }')
        PSS=$(dumpsys meminfo -s $PID | grep "TOTAL PSS" | awk '{ print $3 }')
        if [ "$PSS" == "" ] || [ "$DMA" == "" ]; then
            continue
        fi
        CURRENT=$(($DMA+$PSS))
        if [ "$CURRENT" -gt "$PEAK_MEM" ]; then
            PEAK_MEM=$CURRENT
        fi
        SAMPLES=$(($SAMPLES+1))
        TOTAL=$(($TOTAL+$CURRENT))
    else
        break
    fi
done

rm -rf memory_usage.txt
echo "peak_mem: $PEAK_MEM" >> statistics.txt
AVG_MEM=$(awk -- 'BEGIN{printf "%.3f", ARGV[1]/ARGV[2]}' "$TOTAL" "$SAMPLES")
echo "avg_mem: $AVG_MEM" >> statistics.txt
"""
    # NOTE(review): the `rm -rf memory_usage.txt` above looks like a leftover
    # from an earlier file name — nothing in this patch creates that file.
    with open(memory_script_file, "w") as f:
        f.write(memory_script)

    runner = {
        "qnn": f"{get_build_dir(backend)}/examples/qualcomm/executor_runner/qnn_executor_runner",
        "xnn": f"{get_build_dir(backend)}/backends/xnnpack/xnn_executor_runner",
    }
    artifacts = {
        # QNN needs the HTP libraries from the SDK alongside the backend .so.
        "qnn": [
            pte_path,
            f"{qnn_sdk}/lib/aarch64-android/libQnnHtp.so",
            (
                f"{qnn_sdk}/lib/hexagon-v{htp_arch}/"
                f"unsigned/libQnnHtpV{htp_arch}Skel.so"
            ),
            (f"{qnn_sdk}/lib/aarch64-android/" f"libQnnHtpV{htp_arch}Stub.so"),
            f"{qnn_sdk}/lib/aarch64-android/libQnnHtpPrepare.so",
            f"{qnn_sdk}/lib/aarch64-android/libQnnSystem.so",
            f"{get_build_dir(backend)}/backends/qualcomm/libqnn_executorch_backend.so",
            f"{qnn_sdk}/lib/aarch64-android/libQnnModelDlc.so",
            runner[backend],
            memory_script_file,
        ],
        # XNNPACK is statically linked into its runner.
        "xnn": [
            pte_path,
            runner[backend],
            memory_script_file,
        ],
    }
    return artifacts[backend]
81+
82+
def get_cmds(backend, pte_path, iteration):
    """Build the two shell commands executed on the device.

    Args:
        backend: either "qnn" or "xnn".
        pte_path: host path to the .pte file (only the basename is used on
            the device).
        iteration: number of inferences to run.

    Returns:
        ``[inference_cmd, memory_cmd]`` — the first times the inferences and
        dumps statistics, the second reruns the workload under the
        memory-sampling script.
    """
    # Per-backend runner binary and the flag it uses for the iteration count.
    binary = {"qnn": "qnn_executor_runner", "xnn": "xnn_executor_runner"}[backend]
    iter_flag = {"qnn": "--iteration", "xnn": "--num_executions"}[backend]
    model = os.path.basename(pte_path)
    shared_args = f"--model_path {model} {iter_flag} {iteration}"

    inference_cmd = (
        f"cd {workspace} && "
        f"chmod +x ./{binary} && "
        f"./{binary} {shared_args} --dump_statistics"
    )
    # The memory-profiling run wraps the runner in the sampling script and
    # drops --dump_statistics so inference metrics are not dumped twice.
    memory_cmd = (
        f"cd {workspace} && "
        f"chmod +x ./{binary} && "
        f"chmod +x {memory_script_file} && "
        f"./{memory_script_file} ./{binary} {shared_args}"
    )
    return [inference_cmd, memory_cmd]
145+
146+
147+
def start_benchmark(artifacts, cmds, device, host):
    """Push artifacts to the device, run the benchmark and print results.

    Args:
        artifacts: host paths to push into ``workspace`` on the device.
        cmds: shell command strings executed on the device, in order.
        device: adb serial number.
        host: optional adb gateway hostname (falsy to use the local server).
    """

    def adb(action):
        # Route through a remote adb server when a gateway host is given.
        if not host:
            actions = ["adb", "-s", device]
        else:
            actions = ["adb", "-H", host, "-s", device]
        actions.extend(action)
        subprocess.run(actions, stdout=subprocess.DEVNULL)

    def post_process():
        # Drop any stale local copy (os.remove instead of spawning `rm -rf`:
        # portable and no extra process), then pull and print the fresh one.
        if os.path.exists(perf_file):
            os.remove(perf_file)
        for file_name in [perf_file]:
            adb(["pull", f"{workspace}/{file_name}", "."])
            with open(file_name, "r") as f:
                print(f.read())

    # Start from a clean on-device workspace.
    adb(["shell", "rm", "-rf", workspace])
    adb(["shell", "mkdir", "-p", workspace])
    for artifact in artifacts:
        adb(["push", artifact, workspace])
    for cmd in cmds:
        adb(["shell", cmd])
    post_process()
170+
171+
172+
if __name__ == "__main__":
    # CLI driver: collect artifacts and commands for the chosen backend,
    # then run the benchmark on the selected adb device.
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--backend", required=True, help="either 'qnn' or 'xnn'")
    parser.add_argument("-p", "--pte", required=True, help="path to generated .pte file")
    parser.add_argument("-H", "--host", required=False, help="hostname for adb gateway")
    parser.add_argument(
        "-s", "--device", required=True, help="serial number for adb device"
    )
    parser.add_argument(
        "-i",
        "--iteration",
        required=False,
        default=100,
        help="total number of inferences",
    )
    args = parser.parse_args()
    start_benchmark(
        artifacts=get_artifacts(args.backend, args.pte),
        cmds=get_cmds(args.backend, args.pte, args.iteration),
        device=args.device,
        host=args.host,
    )

build_xnnpack.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash

# Cross-compile the ExecuTorch XNNPACK executor runner for Android arm64-v8a.
#
# Usage: build_xnnpack.sh [-c|--clean_build] [-d|--debug]
# Requires ANDROID_NDK_ROOT to point at an Android NDK installation.

if [[ -z $ANDROID_NDK_ROOT ]]; then
  echo "Please export ANDROID_NDK_ROOT=/path/to/ndk"
  exit 1
fi

CLEAN_BUILD="false"
BUILD_FOLDER="build-xnnpack"
BUILD_TYPE="release"

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Fix: do not shift inside the branches — the loop shifts once per
    # iteration below. The old double shift swallowed every second flag
    # (e.g. `-c -d` never enabled the debug build).
    -c|--clean_build) CLEAN_BUILD="true";;
    -d|--debug) BUILD_TYPE="Debug";;
    *) echo "unknown arg passed: $1"; exit 1;;
  esac
  shift
done

if [ "$CLEAN_BUILD" = true ]; then
  rm -rf $BUILD_FOLDER
fi

cmake \
  -DCMAKE_INSTALL_PREFIX=$BUILD_FOLDER \
  -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI='arm64-v8a' \
  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
  -DEXECUTORCH_BUILD_XNNPACK=ON \
  -DEXECUTORCH_ENABLE_LOGGING=ON \
  -DPYTHON_EXECUTABLE=python \
  -B$BUILD_FOLDER .

cmake --build $BUILD_FOLDER -j9 --target install --config $BUILD_TYPE

examples/portable/executor_runner/executor_runner.cpp

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
* all fp32 tensors.
1919
*/
2020

21+
#include <chrono>
22+
#include <fstream>
2123
#include <iostream>
2224
#include <memory>
2325

@@ -57,6 +59,7 @@ DEFINE_int32(
5759
cpu_threads,
5860
-1,
5961
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
62+
DEFINE_bool(dump_statistics, false, "Dump inference statistics.");
6063

6164
using executorch::extension::FileDataLoader;
6265
using executorch::runtime::Error;
@@ -241,65 +244,78 @@ int main(int argc, char** argv) {
241244
// be used by a single thread at at time, but it can be reused.
242245
//
243246
EventTraceManager tracer;
247+
auto before_load = std::chrono::high_resolution_clock::now();
244248
Result<Method> method = program->load_method(
245249
method_name, &memory_manager, tracer.get_event_tracer());
250+
auto after_load = std::chrono::high_resolution_clock::now();
251+
double interval_load =
252+
std::chrono::duration_cast<std::chrono::microseconds>(
253+
after_load - before_load)
254+
.count() /
255+
1000.0;
246256
ET_CHECK_MSG(
247257
method.ok(),
248258
"Loading of method %s failed with status 0x%" PRIx32,
249259
method_name,
250260
(uint32_t)method.error());
251261
ET_LOG(Info, "Method loaded.");
252262

253-
et_timestamp_t time_spent_executing = 0;
263+
et_timestamp_t time_spent_executing = 0, time_spent_executing_1st = 0;
264+
auto inputs = executorch::extension::prepare_input_tensors(*method);
265+
ET_LOG(Debug, "Preparing inputs.");
266+
// Allocate input tensors and set all of their elements to 1. The `inputs`
267+
// variable owns the allocated memory and must live past the last call to
268+
// `execute()`.
269+
//
270+
// NOTE: we have to re-prepare input tensors on every execution
271+
// because inputs whose space gets reused by memory planning (if
272+
// any such inputs exist) will not be preserved for the next
273+
// execution.
274+
275+
ET_CHECK_MSG(
276+
inputs.ok(),
277+
"Could not prepare inputs: 0x%" PRIx32,
278+
(uint32_t)inputs.error());
279+
ET_LOG(Debug, "Inputs prepared.");
280+
auto before_exec = std::chrono::high_resolution_clock::now();
281+
Error status = method->execute();
282+
auto after_exec = std::chrono::high_resolution_clock::now();
283+
double interval_1st_infs =
284+
std::chrono::duration_cast<std::chrono::microseconds>(
285+
after_exec - before_exec)
286+
.count() /
287+
1000.0;
288+
ET_CHECK_MSG(
289+
status == Error::Ok,
290+
"Execution of method %s failed with status 0x%" PRIx32,
291+
method_name,
292+
(uint32_t)status);
293+
254294
// Run the model.
295+
before_exec = std::chrono::high_resolution_clock::now();
255296
for (uint32_t i = 0; i < FLAGS_num_executions; i++) {
256-
ET_LOG(Debug, "Preparing inputs.");
257-
// Allocate input tensors and set all of their elements to 1. The `inputs`
258-
// variable owns the allocated memory and must live past the last call to
259-
// `execute()`.
260-
//
261-
// NOTE: we have to re-prepare input tensors on every execution
262-
// because inputs whose space gets reused by memory planning (if
263-
// any such inputs exist) will not be preserved for the next
264-
// execution.
265-
auto inputs = executorch::extension::prepare_input_tensors(*method);
266-
ET_CHECK_MSG(
267-
inputs.ok(),
268-
"Could not prepare inputs: 0x%" PRIx32,
269-
(uint32_t)inputs.error());
270-
ET_LOG(Debug, "Inputs prepared.");
271-
272-
const et_timestamp_t before_execute =
273-
executorch::runtime::pal_current_ticks();
274-
Error status = method->execute();
275-
const et_timestamp_t after_execute =
276-
executorch::runtime::pal_current_ticks();
277-
time_spent_executing += after_execute - before_execute;
297+
status = method->execute();
278298
ET_CHECK_MSG(
279299
status == Error::Ok,
280300
"Execution of method %s failed with status 0x%" PRIx32,
281301
method_name,
282302
(uint32_t)status);
283303
}
284-
const auto tick_ratio = et_pal_ticks_to_ns_multiplier();
285-
constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000;
286-
ET_LOG(
287-
Info,
288-
"Model executed successfully %" PRIu32 " time(s) in %f ms.",
289-
FLAGS_num_executions,
290-
static_cast<double>(time_spent_executing) * tick_ratio.numerator /
291-
tick_ratio.denominator / NANOSECONDS_PER_MILLISECOND);
292-
293-
// Print the outputs.
294-
std::vector<EValue> outputs(method->outputs_size());
295-
ET_LOG(Info, "%zu outputs: ", outputs.size());
296-
Error status = method->get_outputs(outputs.data(), outputs.size());
297-
ET_CHECK(status == Error::Ok);
298-
// Print the first and last 100 elements of long lists of scalars.
299-
std::cout << executorch::extension::evalue_edge_items(100);
300-
for (int i = 0; i < outputs.size(); ++i) {
301-
std::cout << "Output " << i << ": " << outputs[i] << std::endl;
304+
after_exec = std::chrono::high_resolution_clock::now();
305+
double interval_infs = std::chrono::duration_cast<std::chrono::microseconds>(
306+
after_exec - before_exec)
307+
.count() /
308+
1000.0 / FLAGS_num_executions;
309+
310+
if (FLAGS_dump_statistics) {
311+
auto output_file_name = "statistics.txt";
312+
std::ofstream fout(output_file_name);
313+
fout << "load: " + std::to_string(interval_load)
314+
<< "\n1st: " + std::to_string(interval_1st_infs)
315+
<< "\navg: " + std::to_string(interval_infs) << std::endl;
316+
fout.close();
302317
}
318+
ET_LOG(Info, "Model executed successfully.");
303319

304320
if (tracer.get_event_tracer()) {
305321
// Dump ETDump data containing profiling/debugging data to file specified in

0 commit comments

Comments
 (0)