pytorch · zingo · Jun 8, 2025 · May 20, 2025
@@ -84,6 +84,8 @@ typedef struct {
 extern "C" {
 void __attribute__((weak)) EthosUBackend_execute_begin() {}
 void __attribute__((weak)) EthosUBackend_execute_end() {}
+__attribute__((weak)) unsigned char* ethosu_fast_scratch = nullptr;
+__attribute__((weak)) size_t ethosu_fast_scratch_size = 0;
 }
 
 class EthosUBackendExecuteCallbacks {
@@ -198,8 +200,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
         handles.weight_data_size,
         ethosu_scratch,
         handles.scratch_data_size,
-        nullptr,
-        0);
+        ethosu_fast_scratch,
+        ethosu_fast_scratch_size);
 
     // Write argument values (from EValue tensor) into Ethos-U scratch
     // TODO(MLETORCH-123): Optimise into direct write from Vela into the SRAM
@@ -309,9 +311,12 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
         static_cast<uint64_t>(
             reinterpret_cast<uintptr_t>((handles.weight_data))),
         static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_scratch)),
-        0};
+        static_cast<uint64_t>(
+            reinterpret_cast<uintptr_t>(ethosu_fast_scratch))};
     size_t bases_size[ETHOSU_NUM_BASE_ADDRS] = {
-        handles.weight_data_size, handles.scratch_data_size, 0};
+        handles.weight_data_size,
+        handles.scratch_data_size,
+        ethosu_fast_scratch_size};
     int result = 0;
     EXECUTORCH_PROF_START(
         event_tracer, event_tracer_local_scope, "+EthosUBackend::execute()NPU");
@@ -321,7 +326,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
         handles.cmd_data_size,
         bases,
         bases_size,
-        3, /* fixed array of pointers to binary interface*/
+        ETHOSU_NUM_BASE_ADDRS, /* fixed array of pointers to binary interface*/
         nullptr);
     EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
 

@@ -103,7 +103,7 @@ then
     memory_mode="Shared_Sram"
     if [[ ${target} =~ "ethos-u85" ]]
     then
-        memory_mode="Sram_Only"
+        memory_mode="Dedicated_Sram_384KB"
     fi
 fi
 

@@ -211,10 +211,7 @@ test_models_ethos-u85() { # End to End model tests using model_test.py
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-512 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=w2l --extra_flags="-DET_ATOL=0.01 -DET_RTOL=0.01"
-    # Temporarily not test inception_v4 on Ethos-U85. To support inception_v4 properly on Ethos-U85, we need to run the model in Dedicated_Sram memory mode with
-    # 384KB(or another amount lower than 2MB) of SRAM passed as fast scratch area. The PR adding support for Dedicated_Sram(https://github.com/pytorch/executorch/pull/10714) 
-    # was reverted due to a change required in an internal variant of the examples/arm/executor_runner/arm_executor_runner.cpp
-    # python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=ic4 --extra_flags="-DET_ATOL=0.8 -DET_RTOL=0.8" --timeout=2400
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=ic4 --extra_flags="-DET_ATOL=0.8 -DET_RTOL=0.8" --timeout=2400
 
     echo "${TEST_SUITE_NAME}: PASS"
     }

@@ -81,7 +81,7 @@ def get_args():
         if "u55" in args.target:
             args.memory_mode = "Shared_Sram"
         elif "u85" in args.target:
-            args.memory_mode = "Sram_Only"
+            args.memory_mode = "Dedicated_Sram_384KB"
         else:
             raise RuntimeError(f"Invalid target name {args.target}")
 

@@ -145,23 +145,15 @@ const size_t temp_allocation_pool_size =
 unsigned char __attribute__((
     section(".bss.tensor_arena"),
     aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
-
-namespace executorch {
-namespace backends {
-namespace arm {
 #if defined(ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE)
+extern "C" {
 size_t ethosu_fast_scratch_size =
     ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE;
 unsigned char __attribute__((section(".bss.ethosu_scratch"), aligned(16)))
 dedicated_sram[ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE];
 unsigned char* ethosu_fast_scratch = dedicated_sram;
-#else
-size_t ethosu_fast_scratch_size = 0;
-unsigned char* ethosu_fast_scratch = nullptr;
+}
 #endif
-} // namespace arm
-} // namespace backends
-} // namespace executorch
 
 void et_pal_init(void) {
   // Enable ARM PMU Clock

@@ -110,7 +110,7 @@ then
     memory_mode="Shared_Sram"
     if [[ ${target} =~ "ethos-u85" ]]
     then
-        memory_mode="Sram_Only"
+        memory_mode="Dedicated_Sram_384KB"
     fi
 fi