Commit 2a35a48
Arm backend: Allocate the scratch buffer in an array rather than in the pte
This change lowers the size of the .pte file and allows the scratch buffer to be allocated in an array, usually placed in SRAM, for more efficient memory usage on an MCU.

Change-Id: I04cf9de49a6116141d402b9ad5ca4f21e2025236
Parent: f4875cc

File tree

5 files changed: +58 -29 lines

backends/arm/arm_vela.py
Lines changed: 2 additions & 2 deletions

@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
             np_path = os.path.join(tmpdir, "output", "out_vela.npz")
         else:
             np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-        blocks = b""

+        blocks = b""
         with np.load(np_path, allow_pickle=False) as data:
             # Construct our modified output_blocks with data in a form easily
             # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
             if not isinstance(data["scratch_shape"][0], np.int64):
                 raise RuntimeError("Expected scratch to be int64")
             block_length = int(data["scratch_shape"][0])
-            bin_blocks["scratch_data"] = b"\x00" * block_length
+            bin_blocks["scratch_size"] = struct.pack("<I", block_length)

             # Capture inputs and outputs
             bin_blocks["inputs"] = vela_bin_pack_io("input", data)

backends/arm/runtime/EthosUBackend.cpp
Lines changed: 13 additions & 6 deletions

@@ -181,14 +181,22 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    }
    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);

+   MemoryAllocator* temp_allocator = context.get_temp_allocator();
+
+   // Use a temporary allocator for the intermediate tensors of the
+   // computation. The allocator is released in runtime/executor/method.cpp at
+   // the end of the execution of the Ethos-U custom delegate
+   char* ethosu_scratch =
+       static_cast<char*>(temp_allocator->allocate(handles.scratch_data_size));
+
    ET_LOG(
        Debug,
        "EthosUBackend::execute: Running program data:\n cmd %p %zu\n weight %p %zu\n scratch %p %zu\n",
        handles.cmd_data,
        handles.cmd_data_size,
        handles.weight_data,
        handles.weight_data_size,
-       handles.scratch_data,
+       ethosu_scratch,
        handles.scratch_data_size);

    // Write argument values (from EValue tensor) into Ethos-U scratch
@@ -197,7 +205,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    for (int i = 0; i < handles.inputs->count; i++) {
      auto tensor_count = 1, io_count = 1;
      auto tensor_in = args[i]->toTensor();
-     char* scratch_addr = handles.scratch_data + handles.inputs->io[i].offset;
+     char* scratch_addr = ethosu_scratch + handles.inputs->io[i].offset;

      // We accept:
      bool supported = 0;
@@ -294,11 +302,11 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    // Ethos-U low level driver expected order for Ethos U-55, we have
    // constant weight data, then scratch (which contains input and output)
    // scratch is written above in this function.
+
    uint64_t bases[2] = {
        static_cast<uint64_t>(
            reinterpret_cast<uintptr_t>((handles.weight_data))),
-       static_cast<uint64_t>(
-           reinterpret_cast<uintptr_t>((handles.scratch_data)))};
+       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_scratch))};
    size_t bases_size[2] = {
        handles.weight_data_size, handles.scratch_data_size};
    int result = 0;
@@ -325,8 +333,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
    // Write outputs from scratch into EValue pointers
    for (int i = 0; i < handles.outputs->count; i++) {
      int tensor_count = 1, io_count = 1;
-     const char* output_addr =
-         handles.scratch_data + handles.outputs->io[i].offset;
+     const char* output_addr = ethosu_scratch + handles.outputs->io[i].offset;
      // Process input EValue into scratch
      // Outputs are in the index immediately after inputs
      auto tensor_out = args[handles.inputs->count + i]->toTensor();
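Everything else in execute() is unchanged offset arithmetic: inputs are staged into the scratch region at per-tensor offsets before the NPU run, and outputs are copied back out afterwards. A rough Python model of that flow (sizes and offsets are hypothetical stand-ins for the Vela IO tables, not the real C++ API):

# Hypothetical scratch layout derived from the Vela IO tables.
scratch_size = 1024
input_offsets = [0]      # handles.inputs->io[i].offset
output_offsets = [512]   # handles.outputs->io[i].offset

# Runtime-side allocation (the temp allocator in the C++ above).
ethosu_scratch = bytearray(scratch_size)

# Stage input bytes into scratch before invoking the NPU.
input_bytes = bytes(range(16))
off = input_offsets[0]
ethosu_scratch[off : off + len(input_bytes)] = input_bytes

# ... NPU executes the command stream, writing into the output region ...

# Copy output bytes back into the caller's tensor afterwards.
off = output_offsets[0]
output_bytes = bytes(ethosu_scratch[off : off + 16])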

backends/arm/runtime/VelaBinStream.cpp
Lines changed: 5 additions & 4 deletions

@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Arm Limited and/or its affiliates.
+ * Copyright 2023, 2025 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -71,9 +71,10 @@ bool vela_bin_read(const char* data, VelaHandles* handles, int size) {
    } else if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
      handles->weight_data = b->data;
      handles->weight_data_size = b->size;
-   } else if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
-     handles->scratch_data = b->data;
-     handles->scratch_data_size = b->size;
+   } else if (!strncmp(b->name, "scratch_size", strlen("scratch_size"))) {
+     const uint32_t* scratch_size_ptr =
+         reinterpret_cast<const uint32_t*>(b->data);
+     handles->scratch_data_size = *scratch_size_ptr;
    } else if (!strncmp(b->name, "inputs", strlen("inputs"))) {
      handles->inputs = (VelaIOs*)b->data;
    } else if (!strncmp(b->name, "outputs", strlen("outputs"))) {
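This is the reader-side counterpart of the struct.pack change in arm_vela.py: the reinterpret_cast reads the four bytes back as a uint32_t, which works because the Cortex-M targets are little-endian like the packed format. A Python mirror of the round trip, with the endianness made explicit rather than relying on host byte order:

import struct

# Payload of a "scratch_size" block, as written by arm_vela.py
# (the size value here is hypothetical).
block_data = struct.pack("<I", 786432)

# Equivalent of the C++ pointer read, decoded as little-endian uint32.
(scratch_data_size,) = struct.unpack("<I", block_data)
assert scratch_data_size == 786432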

backends/arm/test/ops/test_conv_combos.py
Lines changed: 10 additions & 10 deletions

@@ -37,28 +37,28 @@ def __init__(self):
         # (t, c, n, s) = (6, 96, 1, 1)
         # 1. 1x1 CONV2d + ReLU6 (Pointwise)
         self.pointwise_conv2d = torch.nn.Conv2d(
-            in_channels=64, out_channels=384, kernel_size=1, stride=1, groups=1
-        )  ## (1, 384, 81, 81)
-        self.batch_norm2d_16 = torch.nn.BatchNorm2d(384, affine=False)
+            in_channels=32, out_channels=128, kernel_size=1, stride=1, groups=1
+        )  ## (1, 128, 81, 81)
+        self.batch_norm2d_16 = torch.nn.BatchNorm2d(128, affine=False)
         self.relu6 = torch.nn.ReLU6()

         # 2. 3x3 DepthwiseConv2d + ReLu6
         self.depthwise_conv2d = torch.nn.Conv2d(
-            in_channels=384,
-            out_channels=384,
+            in_channels=128,
+            out_channels=128,
             kernel_size=3,
             padding=1,
             stride=1,
-            groups=384,
-        )  ## (1, 384, H, W)
+            groups=128,
+        )  ## (1, 128, H, W)

         # 3. Linear 1x1 Conv2d
         self.pointwise_conv2d_linear = torch.nn.Conv2d(
-            in_channels=384, out_channels=64, kernel_size=1, stride=1, groups=1
-        )  ## (1, 64, 81, 81)
+            in_channels=128, out_channels=32, kernel_size=1, stride=1, groups=1
+        )  ## (1, 32, 81, 81)

     def get_inputs(self) -> Tuple[torch.Tensor]:
-        return (torch.randn(1, 64, 81, 81),)
+        return (torch.randn(1, 32, 81, 81),)

     def forward(self, x):
         input = x
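The channel reduction plausibly keeps the test's peak intermediate tensor inside the new 2MB temp pool (the commit does not state this rationale; it is an inference). Back-of-envelope arithmetic, assuming int8 activations after quantization:

POOL = 2 * 1024 * 1024        # new temp_allocation_pool size: 2,097,152 bytes

old_peak = 1 * 384 * 81 * 81  # 2,519,424 bytes, about 2.4 MiB
new_peak = 1 * 128 * 81 * 81  #   839,808 bytes, about 0.8 MiB

assert old_peak > POOL        # the old shape would not fit the pool
assert new_peak < POOL        # the new shape fits comfortably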

examples/arm/executor_runner/arm_executor_runner.cpp
Lines changed: 28 additions & 7 deletions

@@ -128,15 +128,22 @@ const float et_rtol = 0.01;
  * The temp_allocation_pool is used for allocating temporary data during kernel
  * or delegate execution. This will be reset after each kernel or delegate call.
  * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
- * a better fit
+ * a better fit.
+ *
+ * The Corstone-300 and Corstone-320 platforms have 2MB of SRAM; we allocate a
+ * temporary pool that can fully utilize the memory subsystem of the platform.
+ * If your NN requires more than 2MB of SRAM for the peak intermediate tensor
+ * (Total SRAM Used in the AoT Vela summary), consider compiling your model with
+ * the --optimise Size CLI option in the Vela compile spec to lower the SRAM
+ * consumption of the model.
  */
 #if !defined(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE)
-#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
+#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (2 * 1024 * 1024)
 #endif
 const size_t temp_allocation_pool_size =
     ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE;
 unsigned char __attribute__((
-    section("input_data_sec"),
+    section(".bss.ethosu_scratch"),
     aligned(16))) temp_allocation_pool[temp_allocation_pool_size];

 void et_pal_init(void) {
@@ -207,7 +214,7 @@ namespace {
 class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
  public:
   ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
-      : MemoryAllocator(size, base_address), used_(0) {}
+      : MemoryAllocator(size, base_address), used_(0), peak_used_(0) {}

   void* allocate(size_t size, size_t alignment = kDefaultAlignment) override {
     void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment);
@@ -222,6 +229,8 @@ class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
       } else {
         used_ = (used_ | (alignment - 1)) + 1 + size;
       }
+      if (used_ > peak_used_)
+        peak_used_ = used_;
     }
     return ret;
   }
@@ -231,13 +240,25 @@ class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
     return used_;
   }

+  // Returns the peak memory usage of the allocator's memory buffer
+  // Peak usage is useful when doing multiple allocations & resets
+  size_t peak_used() const {
+    return peak_used_;
+  }
+
   // Returns the free size of the allocator's memory buffer.
   size_t free_size() const {
     return executorch::runtime::MemoryAllocator::size() - used_;
   }

+  void reset() {
+    executorch::runtime::MemoryAllocator::reset();
+    used_ = 0;
+  }
+
  private:
   size_t used_;
+  size_t peak_used_;
 };

 Result<BufferCleanup> prepare_input_tensors(
@@ -682,11 +703,11 @@ int main(int argc, const char* argv[]) {
   if (temp_allocator.size() > 0) {
     ET_LOG(
         Info,
-        "temp_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ",
-        temp_allocator.used_size(),
+        "peak_temp_allocator: %zu / %zu free: %zu ( used: %zu %% ) ",
+        temp_allocator.peak_used(),
         temp_allocator.size(),
         temp_allocator.free_size(),
-        100 * temp_allocator.used_size() / temp_allocator.size());
+        100 * temp_allocator.peak_used() / temp_allocator.size());
   }

   if (status != Error::Ok) {
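The detail that makes the new log line meaningful is that reset() zeroes used_ while peak_used_ survives, so the final report covers the worst case across all kernel and delegate calls. A small Python model of that bookkeeping (a sketch, not the ExecuTorch API; the aligned-cursor branch is assumed, since the diff only shows the unaligned one):

class BumpAllocatorModel:
    """Models ArmMemoryAllocator's used/peak accounting (sketch only)."""

    def __init__(self, size: int):
        self.size = size
        self.used = 0
        self.peak_used = 0

    def allocate(self, size: int, alignment: int = 16) -> bool:
        if self.used % alignment == 0:
            new_used = self.used + size
        else:
            # Mirror of `used_ = (used_ | (alignment - 1)) + 1 + size`:
            # round the cursor up to the next boundary, then bump.
            new_used = (self.used | (alignment - 1)) + 1 + size
        if new_used > self.size:
            return False  # out of pool
        self.used = new_used
        self.peak_used = max(self.peak_used, self.used)
        return True

    def reset(self) -> None:
        # Called after each kernel/delegate execution; the peak survives.
        self.used = 0


pool = BumpAllocatorModel(2 * 1024 * 1024)
pool.allocate(800_000)  # delegate call 1
pool.reset()
pool.allocate(500_000)  # delegate call 2
assert pool.peak_used >= 800_000  # the log reports the worst case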
