Skip to content

Commit 165d29c

Browse files
committed
Arm backend: Add devtools support to example
New flags on run.sh:
  --etdump             Build with etdump and profiling enabled; the etdump is base64-encoded and written to the log.
  --debug_build        Build debug instead of release.
  --extra_build_flags  Extra flags to pass to CMake; this makes it possible, for example, to override the allocator pool size or other build-time CMake flags.

The devtools build has been updated so that FLATCC_EXECUTABLE can be used to point to the flatcc executable.

Signed-off-by: Zingo Andersen <[email protected]>
Change-Id: Ic0fb1e48ee633c5fe91473bdc2db9e894b2fc4fa
1 parent eae0b04 commit 165d29c

File tree

8 files changed

+358
-96
lines changed

8 files changed

+358
-96
lines changed

CMakeLists.txt

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -643,10 +643,17 @@ if(EXECUTORCH_BUILD_XNNPACK)
643643
endif()
644644

645645
if(EXECUTORCH_BUILD_DEVTOOLS)
646-
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
647-
ON
648-
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
649-
)
646+
if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
647+
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
648+
ON
649+
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
650+
)
651+
else()
652+
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
653+
OFF
654+
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
655+
)
656+
endif()
650657
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
651658
endif()
652659

backends/arm/runtime/ArmBackendEthosU.cpp

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515

1616
#include <ethosu_driver.h>
1717

18+
#if defined(ET_EVENT_TRACER_ENABLED)
19+
#include <executorch/runtime/core/event_tracer.h>
20+
#include <executorch/runtime/core/event_tracer_hooks.h>
21+
using executorch::runtime::internal::EventTracerProfileOpScope;
22+
#define EXECUTORCH_INTERNAL_PROF(NAME) \
23+
EventTracerProfileOpScope event_tracer_op_scope = \
24+
EventTracerProfileOpScope(event_tracer, NAME);
25+
#else
26+
#define EXECUTORCH_INTERNAL_PROF(NAME)
27+
#endif
28+
1829
#include <executorch/backends/arm/runtime/VelaBinStream.h>
1930
#include <executorch/runtime/backend/interface.h>
2031
#include <executorch/runtime/core/error.h>
@@ -33,6 +44,7 @@ using executorch::runtime::CompileSpec;
3344
using executorch::runtime::DelegateHandle;
3445
using executorch::runtime::Error;
3546
using executorch::runtime::EValue;
47+
using executorch::runtime::EventTracer;
3648
using executorch::runtime::FreeableBuffer;
3749
using executorch::runtime::MemoryAllocator;
3850
using executorch::runtime::Result;
@@ -109,19 +121,30 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
109121
BackendExecutionContext& context,
110122
DelegateHandle* input_handle,
111123
EValue** args) const override {
124+
EventTracer* event_tracer =
125+
context.event_tracer(); // used by EXECUTORCH_INTERNAL_PROF()
126+
EXECUTORCH_INTERNAL_PROF("ArmBackend::execute()");
127+
ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // no "()": that would declare a function instead of constructing the object
128+
112129
ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
113130
VelaHandles handles;
114131

115-
ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
116132
// Command stream - we know at this point it's aligned
117-
char* data = (char*)execution_handle->processed->data();
133+
char* data;
134+
{
135+
EXECUTORCH_INTERNAL_PROF("ArmBackend::execute()processed_data");
136+
data = (char*)execution_handle->processed->data();
137+
}
118138
ET_LOG(Debug, "ArmBackend::execute %p", data);
119139

120-
// Read key sections from the vela_bin_stream
121-
if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
122-
false) {
123-
ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
124-
return Error::InvalidProgram;
140+
{
141+
EXECUTORCH_INTERNAL_PROF("ArmBackend::execute()vela_bin_read()");
142+
// Read key sections from the vela_bin_stream
143+
if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
144+
false) {
145+
ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
146+
return Error::InvalidProgram;
147+
}
125148
}
126149

127150
ET_LOG(
@@ -185,6 +208,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
185208

186209
// Select a compatible copy routine
187210
if (both_char and permuted_input_shape) {
211+
EXECUTORCH_INTERNAL_PROF(
212+
"ArmBackend::execute()handles.input.permute_CHW_to_HWC()");
188213
// permuted byte copy CHW to HWC
189214
permute_CHW_to_HWC(
190215
tensor_in.mutable_data_ptr<char>(),
@@ -193,6 +218,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
193218
tensor_in.size(2),
194219
tensor_in.size(3));
195220
} else if (both_char or both_int) {
221+
EXECUTORCH_INTERNAL_PROF("ArmBackend::execute()handles.input.memcpy()");
196222
// Sizes match and elt size matches so memcpy
197223
memcpy(
198224
scratch_addr,
@@ -220,14 +246,18 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
220246
(uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
221247
size_t bases_size[2] = {
222248
handles.weight_data_size, handles.scratch_data_size};
223-
int result = ethosu_invoke_v3(
224-
driver.get(),
225-
(void*)handles.cmd_data,
226-
handles.cmd_data_size,
227-
bases,
228-
bases_size,
229-
2, /* fixed array of pointers to binary interface*/
230-
nullptr);
249+
int result = 0;
250+
{
251+
EXECUTORCH_INTERNAL_PROF("ArmBackend::execute()NPU");
252+
result = ethosu_invoke_v3(
253+
driver.get(),
254+
(void*)handles.cmd_data,
255+
handles.cmd_data_size,
256+
bases,
257+
bases_size,
258+
2, /* fixed array of pointers to binary interface*/
259+
nullptr);
260+
}
231261

232262
if (result != 0) {
233263
ET_LOG(
@@ -253,6 +283,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
253283
&permuted_output_shape));
254284
if (tensor_out.scalar_type() == ScalarType::Char and
255285
permuted_output_shape) {
286+
EXECUTORCH_INTERNAL_PROF(
287+
"ArmBackend::execute()handles.output.permute_HWC_to_CHW()");
288+
256289
char* output_address = (char*)output_addr;
257290
permute_HWC_to_CHW(
258291
output_address,
@@ -261,6 +294,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
261294
tensor_out.size(2),
262295
tensor_out.size(3));
263296
} else {
297+
EXECUTORCH_INTERNAL_PROF("ArmBackend::execute()handles.output.move()");
264298
for (int j = 0; j < tensor_out.numel(); j++) {
265299
if (tensor_out.scalar_type() == ScalarType::Char) {
266300
char* output_address = (char*)output_addr;

devtools/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19)
1313

1414
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
1515

16+
set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
17+
1618
if(NOT CMAKE_CXX_STANDARD)
1719
set(CMAKE_CXX_STANDARD 17)
1820
endif()
1921

2022
if(NOT FLATCC_EXECUTABLE)
21-
set(FLATCC_EXECUTABLE flatcc)
23+
set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc)
2224
endif()
2325

2426
# Source root directory for executorch.
@@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE
6668
OFF
6769
CACHE BOOL ""
6870
)
69-
set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
71+
7072
add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc)
7173

7274
# Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making
@@ -163,7 +165,7 @@ add_custom_command(
163165
# Note that the flatcc project actually writes its outputs into the source
164166
# tree instead of under the binary directory, and there's no way to change
165167
# that behavior.
166-
${_flatcc_source_dir}/bin/flatcc -cwr -o
168+
${FLATCC_EXECUTABLE} -cwr -o
167169
${_program_schema__include_dir}/executorch/devtools/etdump
168170
${_etdump_schema__srcs}
169171
COMMAND rm -f ${_etdump_schema_cleanup_paths}

examples/arm/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,8 @@ generate_bindings_for_kernels(
5757
gen_operators_lib(
5858
LIB_NAME "arm_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch
5959
)
60+
61+
if(EXECUTORCH_ENABLE_EVENT_TRACER)
62+
target_compile_definitions(executorch INTERFACE ET_EVENT_TRACER_ENABLED) # propagate the macro as a definition, not a raw -D flag
63+
target_compile_definitions(portable_ops_lib INTERFACE ET_EVENT_TRACER_ENABLED) # propagate the macro as a definition, not a raw -D flag
64+
endif()

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF)
1010
option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF)
1111
option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF)
1212

13-
1413
if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
1514
message(
1615
FATAL_ERROR
@@ -220,10 +219,8 @@ target_sources(
220219
# Include the target's bare-metal linker script
221220
ethosu_eval_link_options(arm_executor_runner)
222221

223-
# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
224-
# bin size as we link in a number of other symbols
225-
target_link_libraries(
226-
arm_executor_runner
222+
set(arm_executor_runner_link)
223+
list(APPEND arm_executor_runner_link
227224
extension_runner_util
228225
ethosu_target_init
229226
executorch
@@ -236,6 +233,44 @@ target_link_libraries(
236233
"-Wl,--no-whole-archive"
237234
)
238235

236+
if(EXECUTORCH_ENABLE_EVENT_TRACER)
237+
target_compile_definitions(arm_executor_runner PUBLIC ET_EVENT_TRACER_ENABLED) # preprocessor macro belongs in compile definitions, not a raw -D option
238+
239+
add_library(etdump STATIC IMPORTED)
240+
set_property(
241+
TARGET etdump
242+
PROPERTY IMPORTED_LOCATION
243+
"${ET_BUILD_DIR_PATH}/lib/libetdump.a"
244+
)
245+
246+
if(CMAKE_BUILD_TYPE MATCHES "Debug")
247+
set(FLATCCRT_LIB flatccrt_d)
248+
else()
249+
set(FLATCCRT_LIB flatccrt)
250+
endif()
251+
252+
add_library(${FLATCCRT_LIB} STATIC IMPORTED)
253+
set_property(
254+
TARGET ${FLATCCRT_LIB}
255+
PROPERTY IMPORTED_LOCATION
256+
"${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
257+
)
258+
259+
list(APPEND arm_executor_runner_link
260+
etdump
261+
${FLATCCRT_LIB}
262+
)
263+
endif()
264+
265+
# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
266+
# bin size as we link in a number of other symbols
267+
target_link_libraries(
268+
arm_executor_runner
269+
${arm_executor_runner_link}
270+
)
271+
272+
target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map )
273+
239274
# ET headers and generated headers includes
240275
target_include_directories(
241276
arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR}

0 commit comments

Comments
 (0)