Commit 2a35a48
Arm backend: Allocate the scratch buffer in an array rather than in the pte
This change lowers the size of the .pte file and allows the scratch buffer to be allocated in an array, usually placed in SRAM, for more efficient memory usage on an MCU.

Change-Id: I04cf9de49a6116141d402b9ad5ca4f21e2025236
Parent: f4875cc

File tree

5 files changed: +58 -29 lines

backends/arm/arm_vela.py
Lines changed: 2 additions & 2 deletions

@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
             np_path = os.path.join(tmpdir, "output", "out_vela.npz")
         else:
             np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-        blocks = b""

+        blocks = b""
         with np.load(np_path, allow_pickle=False) as data:
             # Construct our modified output_blocks with data in a form easily
             # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
             if not isinstance(data["scratch_shape"][0], np.int64):
                 raise RuntimeError("Expected scratch to be int64")
             block_length = int(data["scratch_shape"][0])
-            bin_blocks["scratch_data"] = b"\x00" * block_length
+            bin_blocks["scratch_size"] = struct.pack("<I", block_length)

             # Capture inputs and outputs
             bin_blocks["inputs"] = vela_bin_pack_io("input", data)

backends/arm/runtime/EthosUBackend.cpp
Lines changed: 13 additions & 6 deletions

@@ -181,14 +181,22 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    }
    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);

+   MemoryAllocator* temp_allocator = context.get_temp_allocator();
+
+   // Use a temporary allocator for the intermediate tensors of the
+   // computation. The allocator is released in runtime/executor/method.cpp at
+   // the end of the execution of the Ethos-U custom delegate
+   char* ethosu_scratch =
+       static_cast<char*>(temp_allocator->allocate(handles.scratch_data_size));
+
    ET_LOG(
        Debug,
        "EthosUBackend::execute: Running program data:\n cmd %p %zu\n weight %p %zu\n scratch %p %zu\n",
        handles.cmd_data,
        handles.cmd_data_size,
        handles.weight_data,
        handles.weight_data_size,
-       handles.scratch_data,
+       ethosu_scratch,
        handles.scratch_data_size);

    // Write argument values (from EValue tensor) into Ethos-U scratch
@@ -197,7 +205,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    for (int i = 0; i < handles.inputs->count; i++) {
      auto tensor_count = 1, io_count = 1;
      auto tensor_in = args[i]->toTensor();
-     char* scratch_addr = handles.scratch_data + handles.inputs->io[i].offset;
+     char* scratch_addr = ethosu_scratch + handles.inputs->io[i].offset;

      // We accept:
      bool supported = 0;
@@ -294,11 +302,11 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    // Ethos-U low level driver expected order for Ethos U-55, we have
    // constant weight data, then scratch (which contains input and output)
    // scratch is written above in this function.
+
    uint64_t bases[2] = {
        static_cast<uint64_t>(
            reinterpret_cast<uintptr_t>((handles.weight_data))),
-       static_cast<uint64_t>(
-           reinterpret_cast<uintptr_t>((handles.scratch_data)))};
+       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_scratch))};
    size_t bases_size[2] = {
        handles.weight_data_size, handles.scratch_data_size};
    int result = 0;
@@ -325,8 +333,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    // Write outputs from scratch into EValue pointers
    for (int i = 0; i < handles.outputs->count; i++) {
      int tensor_count = 1, io_count = 1;
-     const char* output_addr =
-         handles.scratch_data + handles.outputs->io[i].offset;
+     const char* output_addr = ethosu_scratch + handles.outputs->io[i].offset;
      // Process input EValue into scratch
      // Outputs are in the index immediately after inputs
      auto tensor_out = args[handles.inputs->count + i]->toTensor();
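Everything else in execute() is unchanged offset arithmetic: inputs are staged into the scratch region at per-tensor offsets before the NPU run, and outputs are copied back out afterwards. A rough Python model of that flow (sizes and offsets are hypothetical stand-ins for the Vela IO tables, not the real C++ API):

# Hypothetical scratch layout derived from the Vela IO tables.
scratch_size = 1024
input_offsets = [0]      # handles.inputs->io[i].offset
output_offsets = [512]   # handles.outputs->io[i].offset

# Runtime-side allocation (the temp allocator in the C++ above).
ethosu_scratch = bytearray(scratch_size)

# Stage input bytes into scratch before invoking the NPU.
input_bytes = bytes(range(16))
off = input_offsets[0]
ethosu_scratch[off : off + len(input_bytes)] = input_bytes

# ... NPU executes the command stream, writing into the output region ...

# Copy output bytes back into the caller's tensor afterwards.
off = output_offsets[0]
output_bytes = bytes(ethosu_scratch[off : off + 16])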

backends/arm/runtime/VelaBinStream.cpp
Lines changed: 5 additions & 4 deletions

@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Arm Limited and/or its affiliates.
+ * Copyright 2023, 2025 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -71,9 +71,10 @@ bool vela_bin_read(const char* data, VelaHandles* handles, int size) {
    } else if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
      handles->weight_data = b->data;
      handles->weight_data_size = b->size;
-   } else if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
-     handles->scratch_data = b->data;
-     handles->scratch_data_size = b->size;
+   } else if (!strncmp(b->name, "scratch_size", strlen("scratch_size"))) {
+     const uint32_t* scratch_size_ptr =
+         reinterpret_cast<const uint32_t*>(b->data);
+     handles->scratch_data_size = *scratch_size_ptr;
    } else if (!strncmp(b->name, "inputs", strlen("inputs"))) {
      handles->inputs = (VelaIOs*)b->data;
    } else if (!strncmp(b->name, "outputs", strlen("outputs"))) {
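This is the reader-side counterpart of the struct.pack change in arm_vela.py: the reinterpret_cast reads the four bytes back as a uint32_t, which works because the Cortex-M targets are little-endian like the packed format. A Python mirror of the round trip, with the endianness made explicit rather than relying on host byte order:

import struct

# Payload of a "scratch_size" block, as written by arm_vela.py
# (the size value here is hypothetical).
block_data = struct.pack("<I", 786432)

# Equivalent of the C++ pointer read, decoded as little-endian uint32.
(scratch_data_size,) = struct.unpack("<I", block_data)
assert scratch_data_size == 786432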

backends/arm/test/ops/test_conv_combos.py
Lines changed: 10 additions & 10 deletions

@@ -37,28 +37,28 @@ def __init__(self):
         # (t, c, n, s) = (6, 96, 1, 1)
         # 1. 1x1 CONV2d + ReLU6 (Pointwise)
         self.pointwise_conv2d = torch.nn.Conv2d(
-            in_channels=64, out_channels=384, kernel_size=1, stride=1, groups=1
-        )  ## (1, 384, 81, 81)
-        self.batch_norm2d_16 = torch.nn.BatchNorm2d(384, affine=False)
+            in_channels=32, out_channels=128, kernel_size=1, stride=1, groups=1
+        )  ## (1, 128, 81, 81)
+        self.batch_norm2d_16 = torch.nn.BatchNorm2d(128, affine=False)
         self.relu6 = torch.nn.ReLU6()

         # 2. 3x3 DepthwiseConv2d + ReLu6
         self.depthwise_conv2d = torch.nn.Conv2d(
-            in_channels=384,
-            out_channels=384,
+            in_channels=128,
+            out_channels=128,
             kernel_size=3,
             padding=1,
             stride=1,
-            groups=384,
-        )  ## (1, 384, H, W)
+            groups=128,
+        )  ## (1, 128, H, W)

         # 3. Linear 1x1 Conv2d
         self.pointwise_conv2d_linear = torch.nn.Conv2d(
-            in_channels=384, out_channels=64, kernel_size=1, stride=1, groups=1
-        )  ## (1, 64, 81, 81)
+            in_channels=128, out_channels=32, kernel_size=1, stride=1, groups=1
+        )  ## (1, 32, 81, 81)

     def get_inputs(self) -> Tuple[torch.Tensor]:
-        return (torch.randn(1, 64, 81, 81),)
+        return (torch.randn(1, 32, 81, 81),)

     def forward(self, x):
         input = x
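The channel reduction plausibly keeps the test's peak intermediate tensor inside the new 2MB temp pool (the commit does not state this rationale; it is an inference). Back-of-envelope arithmetic, assuming int8 activations after quantization:

POOL = 2 * 1024 * 1024        # new temp_allocation_pool size: 2,097,152 bytes

old_peak = 1 * 384 * 81 * 81  # 2,519,424 bytes, about 2.4 MiB
new_peak = 1 * 128 * 81 * 81  #   839,808 bytes, about 0.8 MiB

assert old_peak > POOL        # the old shape would not fit the pool
assert new_peak < POOL        # the new shape fits comfortably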

examples/arm/executor_runner/arm_executor_runner.cpp
Lines changed: 28 additions & 7 deletions

@@ -128,15 +128,22 @@ const float et_rtol = 0.01;
  * The temp_allocation_pool is used for allocating temporary data during kernel
  * or delegate execution. This will be reset after each kernel or delegate call.
  * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
- * a better fit
+ * a better fit.
+ *
+ * The Corstone-300 and Corstone-320 platforms have 2MB of SRAM; we allocate a
+ * temporary pool that can fully utilize the memory subsystem of the platform.
+ * If your NN requires more than 2MB of SRAM for the peak intermediate tensor
+ * (Total SRAM Used in the AoT Vela summary), consider compiling your model with
+ * the --optimise Size CLI option in the Vela compile spec to lower the SRAM
+ * consumption of the model.
  */
 #if !defined(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE)
-#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
+#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (2 * 1024 * 1024)
 #endif
 const size_t temp_allocation_pool_size =
     ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE;
 unsigned char __attribute__((
-    section("input_data_sec"),
+    section(".bss.ethosu_scratch"),
     aligned(16))) temp_allocation_pool[temp_allocation_pool_size];

 void et_pal_init(void) {
@@ -207,7 +214,7 @@ namespace {
 class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
  public:
   ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
-      : MemoryAllocator(size, base_address), used_(0) {}
+      : MemoryAllocator(size, base_address), used_(0), peak_used_(0) {}

   void* allocate(size_t size, size_t alignment = kDefaultAlignment) override {
     void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment);
@@ -222,6 +229,8 @@ class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
       } else {
         used_ = (used_ | (alignment - 1)) + 1 + size;
       }
+      if (used_ > peak_used_)
+        peak_used_ = used_;
     }
     return ret;
   }
@@ -231,13 +240,25 @@ class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
     return used_;
   }

+  // Returns the peak memory usage of the allocator's memory buffer
+  // Peak usage is useful when doing multiple allocations & resets
+  size_t peak_used() const {
+    return peak_used_;
+  }
+
   // Returns the free size of the allocator's memory buffer.
   size_t free_size() const {
     return executorch::runtime::MemoryAllocator::size() - used_;
   }

+  void reset() {
+    executorch::runtime::MemoryAllocator::reset();
+    used_ = 0;
+  }
+
  private:
   size_t used_;
+  size_t peak_used_;
 };

 Result<BufferCleanup> prepare_input_tensors(
@@ -682,11 +703,11 @@ int main(int argc, const char* argv[]) {
   if (temp_allocator.size() > 0) {
     ET_LOG(
         Info,
-        "temp_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ",
-        temp_allocator.used_size(),
+        "peak_temp_allocator: %zu / %zu free: %zu ( used: %zu %% ) ",
+        temp_allocator.peak_used(),
         temp_allocator.size(),
         temp_allocator.free_size(),
-        100 * temp_allocator.used_size() / temp_allocator.size());
+        100 * temp_allocator.peak_used() / temp_allocator.size());
   }

   if (status != Error::Ok) {
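The detail that makes the new log line meaningful is that reset() zeroes used_ while peak_used_ survives, so the final report covers the worst case across all kernel and delegate calls. A small Python model of that bookkeeping (a sketch, not the ExecuTorch API; the aligned-cursor branch is assumed, since the diff only shows the unaligned one):

class BumpAllocatorModel:
    """Models ArmMemoryAllocator's used/peak accounting (sketch only)."""

    def __init__(self, size: int):
        self.size = size
        self.used = 0
        self.peak_used = 0

    def allocate(self, size: int, alignment: int = 16) -> bool:
        if self.used % alignment == 0:
            new_used = self.used + size
        else:
            # Mirror of `used_ = (used_ | (alignment - 1)) + 1 + size`:
            # round the cursor up to the next boundary, then bump.
            new_used = (self.used | (alignment - 1)) + 1 + size
        if new_used > self.size:
            return False  # out of pool
        self.used = new_used
        self.peak_used = max(self.peak_used, self.used)
        return True

    def reset(self) -> None:
        # Called after each kernel/delegate execution; the peak survives.
        self.used = 0


pool = BumpAllocatorModel(2 * 1024 * 1024)
pool.allocate(800_000)  # delegate call 1
pool.reset()
pool.allocate(500_000)  # delegate call 2
assert pool.peak_used >= 800_000  # the log reports the worst case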
