1515
1616#include < ethosu_driver.h>
1717
18+ #if defined(ET_EVENT_TRACER_ENABLED)
19+ #include < executorch/runtime/core/event_tracer.h>
20+ #include < executorch/runtime/core/event_tracer_hooks.h>
21+ using executorch::runtime::EventTracer;
22+ using executorch::runtime::EventTracerEntry;
23+
24+ class EventTraceScope {
25+ public:
26+ EventTraceScope (EventTracer* event_tracer_, const char * name) {
27+ event_tracer = event_tracer_;
28+ event_tracer_entry_scope = event_tracer->start_profiling (name);
29+ }
30+ ~EventTraceScope () {
31+ event_tracer->end_profiling (event_tracer_entry_scope);
32+ }
33+
34+ private:
35+ EventTracer* event_tracer;
36+ EventTracerEntry event_tracer_entry_scope;
37+ };
// Profiling helpers used when ET_EVENT_TRACER_ENABLED is defined.
//
// Note: there must be no whitespace between a macro name and its parameter
// list, otherwise the preprocessor defines an object-like macro instead.

// Declares a local `EventTraceScope` named `event_tracer_scope` that profiles
// from this point to the end of the enclosing block. Because the variable name
// is fixed, use it at most once per block.
#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
  EventTraceScope event_tracer_scope { (EVENTTRACER), (NAME) }

// Starts a profiling event called NAME and stores the returned entry in SCOPE.
// Does nothing when EVENTTRACER is null. EVENTTRACER is expanded twice, so
// pass a side-effect-free expression.
#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)   \
  do {                                                    \
    if ((EVENTTRACER) != nullptr) {                       \
      (SCOPE) = (EVENTTRACER)->start_profiling((NAME));   \
    }                                                     \
  } while (0)

// Ends the profiling event previously stored in SCOPE by
// EXECUTORCH_PROF_START. Does nothing when EVENTTRACER is null.
#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
  do {                                          \
    if ((EVENTTRACER) != nullptr) {             \
      (EVENTTRACER)->end_profiling((SCOPE));    \
    }                                           \
  } while (0)
44+
45+ #else
// Event tracing is compiled out: the profiling macros expand to nothing and
// their arguments are discarded without being evaluated, so call sites may
// name variables that are only declared when ET_EVENT_TRACER_ENABLED is set.
//
// Note: there must be no whitespace between a macro name and its parameter
// list, otherwise the preprocessor defines an object-like macro instead.
#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
49+ #endif
50+
1851#include < executorch/backends/arm/runtime/VelaBinStream.h>
1952#include < executorch/runtime/backend/interface.h>
2053#include < executorch/runtime/core/error.h>
@@ -109,20 +142,38 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
109142 BackendExecutionContext& context,
110143 DelegateHandle* input_handle,
111144 EValue** args) const override {
145+ #if defined(ET_EVENT_TRACER_ENABLED)
146+ EventTracer* event_tracer = context.event_tracer ();
147+ EventTracerEntry event_tracer_local_scope;
148+ #endif
149+
150+ EXECUTORCH_PROF_SCOPE (event_tracer, " ArmBackend::execute()" );
151+ ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
152+
112153 ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
113154 VelaHandles handles;
114155
115- ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
116156 // Command stream - we know at this point it's aligned
157+ EXECUTORCH_PROF_START (
158+ event_tracer,
159+ event_tracer_local_scope,
160+ " +ArmBackend::execute()processed_data" );
117161 char * data = (char *)execution_handle->processed ->data ();
162+ EXECUTORCH_PROF_END (event_tracer, event_tracer_local_scope);
163+
118164 ET_LOG (Debug, " ArmBackend::execute %p" , data);
119165
166+ EXECUTORCH_PROF_START (
167+ event_tracer,
168+ event_tracer_local_scope,
169+ " +ArmBackend::execute()vela_bin_read()" );
120170 // Read key sections from the vela_bin_stream
121171 if (vela_bin_read (data, &handles, execution_handle->processed ->size ()) ==
122172 false ) {
123173 ET_LOG (Error, " ArmBackend::vela_read: error, invalid binary layout" );
124174 return Error::InvalidProgram;
125175 }
176+ EXECUTORCH_PROF_END (event_tracer, event_tracer_local_scope);
126177
127178 ET_LOG (
128179 Debug,
@@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
186237
187238 // Select a compatible copy routine
188239 if (both_char and permuted_input_shape) {
240+ EXECUTORCH_PROF_SCOPE (
241+ event_tracer,
242+ " +ArmBackend::execute()handles.input.permute_CHW_to_HWC()" );
189243 // permuted byte copy CHW to HWC
190244 permute_CHW_to_HWC (
191245 tensor_in.mutable_data_ptr <char >(),
@@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
194248 tensor_in.size (2 ),
195249 tensor_in.size (3 ));
196250 } else if (both_char or both_int) {
251+ EXECUTORCH_PROF_SCOPE (
252+ event_tracer, " +ArmBackend::execute()handles.input.memcpy()" );
197253 // Sizes match and elt size matches so memcpy
198254 memcpy (
199255 scratch_addr,
@@ -234,14 +290,18 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
234290 (uint64_t )handles.weight_data , (uint64_t )handles.scratch_data };
235291 size_t bases_size[2 ] = {
236292 handles.weight_data_size , handles.scratch_data_size };
237- int result = ethosu_invoke_v3 (
293+ int result = 0 ;
294+ EXECUTORCH_PROF_START (
295+ event_tracer, event_tracer_local_scope, " +ArmBackend::execute()NPU" );
296+ result = ethosu_invoke_v3 (
238297 driver.get (),
239298 (void *)handles.cmd_data ,
240299 handles.cmd_data_size ,
241300 bases,
242301 bases_size,
243302 2 , /* fixed array of pointers to binary interface*/
244303 nullptr );
304+ EXECUTORCH_PROF_END (event_tracer, event_tracer_local_scope);
245305
246306 if (result != 0 ) {
247307 ET_LOG (
@@ -277,6 +337,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
277337 &permuted_output_shape));
278338 if (tensor_out.scalar_type () == ScalarType::Char and
279339 permuted_output_shape) {
340+ EXECUTORCH_PROF_SCOPE (
341+ event_tracer,
342+ " +ArmBackend::execute()handles.output.permute_HWC_to_CHW()" );
343+
280344 char * output_address = (char *)output_addr;
281345 permute_HWC_to_CHW (
282346 output_address,
@@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
285349 tensor_out.size (2 ),
286350 tensor_out.size (3 ));
287351 } else {
352+ EXECUTORCH_PROF_SCOPE (
353+ event_tracer, " +ArmBackend::execute()handles.output.move()" );
288354 for (int j = 0 ; j < tensor_out.numel (); j++) {
289355 if (tensor_out.scalar_type () == ScalarType::Char) {
290356 char * output_address = (char *)output_addr;
0 commit comments