
Commit 9ade5b0

Sebastian-Larsson authored and abhinaykukkadapu committed
Arm backend: Handle Ethos-U output layout mismatches (pytorch#15588)
Vela can pad or pack an inference output, so the byte layout of a tensor may not match what ExecuTorch expects. The runtime now detects those cases and strips padding and/or expand packed 4‑bit activations back into signed int8 tensors. Signed-off-by: Sebastian Larsson <[email protected]>
1 parent b45cc79 commit 9ade5b0
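
For context, a minimal standalone sketch (not part of this commit) of the packed-4-bit expansion the message describes: two signed 4-bit values per byte, low nibble first, sign-extended into int8. The helper name and sample byte below are illustrative assumptions, not code from the patch.

// Illustrative only: expand one packed byte into two signed int8 values,
// the same nibble handling the new runtime path applies per output chunk.
#include <cstdint>
#include <cstdio>

static void unpack_int4_pair(uint8_t packed, int8_t out[2]) {
  int8_t low = static_cast<int8_t>(packed & 0x0F); // low nibble first
  int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F);
  if (low >= 8) {
    low -= 16; // sign-extend values 8..15 to -8..-1
  }
  if (high >= 8) {
    high -= 16;
  }
  out[0] = low;
  out[1] = high;
}

int main() {
  int8_t out[2];
  unpack_int4_pair(0xF7, out);            // low nibble 0x7, high nibble 0xF
  std::printf("%d %d\n", out[0], out[1]); // prints "7 -1"
  return 0;
}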

File tree

1 file changed: +172 -14 lines changed

backends/arm/runtime/EthosUBackend.cpp

Lines changed: 172 additions & 14 deletions
@@ -326,7 +326,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
       return Error::InvalidProgram;
     }
-    int tensor_dim = 0, io_dim = 0;
+    size_t tensor_bytes_total = 0;
+    size_t io_bytes_total = 0;
     // Write outputs from scratch into EValue pointers
     for (int i = 0; i < handles.outputs->count; i++) {
       int tensor_count = 1, io_count = 1;
@@ -338,23 +339,39 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       calculate_dimensions(
           tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
 
-      // At times the topological order of the outputs may change.
-      // Lets instead ensure that the sum of dimensions match.
-      tensor_dim = tensor_dim + tensor_count;
-      io_dim = io_dim + io_count;
+      size_t tensor_bytes = tensor_out.nbytes();
+      size_t io_bytes = static_cast<size_t>(io_count) *
+          static_cast<size_t>(handles.outputs->io[i].elem_size);
+
+      if (tensor_bytes != io_bytes) {
+        Error status = copy_with_layout_adjustment(
+            handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
+        if (status != Error::Ok) {
+          return status;
+        }
+        io_bytes_total += tensor_bytes;
+      } else {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
 
-      EXECUTORCH_PROF_SCOPE(
-          event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
+        memcpy(
+            tensor_out.mutable_data_ptr<char>(),
+            static_cast<const char*>(output_addr),
+            tensor_bytes);
+        io_bytes_total += io_bytes;
+      }
 
-      memcpy(
-          tensor_out.mutable_data_ptr<char>(),
-          static_cast<const char*>(output_addr),
-          tensor_out.nbytes());
+      // At times the topological order of the outputs may change.
+      // Lets instead ensure that the sum of output bytes match.
+      tensor_bytes_total += tensor_bytes;
     }
-    if (tensor_dim != io_dim) {
+    if (tensor_bytes_total != io_bytes_total) {
       ET_LOG(Error, "Total output tensor sizes do not match");
       ET_LOG(
-          Error, "Program expects size of %d but got %d", tensor_dim, io_dim);
+          Error,
+          "Program expects %zu bytes but got %zu",
+          io_bytes_total,
+          tensor_bytes_total);
       return Error::InvalidProgram;
     }
     return Error::Ok;
@@ -365,6 +382,147 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
   }
 
  private:
+  // Copies Vela output into the ExecuTorch tensor, adjusting for padding or
+  // packed layouts produced by the delegate.
+  Error copy_with_layout_adjustment(
+      const VelaIO& output_io,
+      int output_index,
+      const char* src,
+      executorch::aten::Tensor& tensor_out,
+      size_t tensor_bytes) const {
+    const int elem_size = output_io.elem_size;
+    if (elem_size == 0) {
+      ET_LOG(
+          Error, "Ethos-U output %d reports zero element size", output_index);
+      return Error::InvalidProgram;
+    }
+
+    size_t chunk_count = 1;
+    for (int dim = 0; dim < shapeDim - 1; ++dim) {
+      const int vela_dim = output_io.shape[dim];
+      chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim);
+    }
+    const int last_dim = output_io.shape[shapeDim - 1];
+    const size_t vela_chunk_elems =
+        static_cast<size_t>(last_dim == 0 ? 1 : last_dim);
+    const size_t vela_chunk_size =
+        vela_chunk_elems * static_cast<size_t>(elem_size);
+
+    if (tensor_bytes % chunk_count != 0) {
+      ET_LOG(
+          Error,
+          "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu",
+          output_index,
+          tensor_bytes,
+          chunk_count);
+      return Error::InvalidProgram;
+    }
+
+    const size_t chunk_size = tensor_bytes / chunk_count;
+
+    // If Vela writes fewer bytes than the tensor expects we may need to
+    // expand 4-bit data to 8-bit. Ethos-U outputs may be
+    // packed 4-bit values but ExecuTorch tensors are at least 8-bit.
+    if (vela_chunk_size < chunk_size) {
+      if (chunk_size % vela_chunk_size != 0) {
+        ET_LOG(
+            Error,
+            "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu",
+            output_index,
+            chunk_size,
+            vela_chunk_size);
+        return Error::InvalidProgram;
+      }
+
+      const size_t expand_factor = chunk_size / vela_chunk_size;
+      if (expand_factor == 2 && elem_size == 1 &&
+          tensor_out.scalar_type() == ScalarType::Char) {
+        return unpack_chunks_4bit_to_int8(
+            reinterpret_cast<const uint8_t*>(src),
+            tensor_out.mutable_data_ptr<int8_t>(),
+            chunk_count,
+            chunk_size,
+            vela_chunk_size);
+      }
+
+      ET_LOG(
+          Error,
+          "Ethos-U output %d expansion factor %zu with element size %d not supported",
+          output_index,
+          expand_factor,
+          elem_size);
+      return Error::InvalidProgram;
+    }
+
+    return strip_delegate_padding(
+        src,
+        tensor_out.mutable_data_ptr<char>(),
+        chunk_count,
+        chunk_size,
+        vela_chunk_size);
+  }
+
+  Error unpack_chunks_4bit_to_int8(
+      const uint8_t* src,
+      int8_t* dest,
+      size_t chunk_count,
+      size_t dest_chunk_size,
+      size_t src_chunk_size) const {
+    const uint8_t* chunk_src = src;
+    int8_t* chunk_dest = dest;
+    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+      unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size);
+      chunk_src += src_chunk_size;
+      chunk_dest += dest_chunk_size;
+    }
+    return Error::Ok;
+  }
+
+  void unpack_single_chunk_4bit_to_int8(
+      const uint8_t* src,
+      int8_t* dest,
+      size_t chunk_size) const {
+    for (size_t byte_idx = 0; byte_idx < chunk_size; ++byte_idx) {
+      const uint8_t packed = src[byte_idx];
+      int8_t low = static_cast<int8_t>(packed & 0x0F);
+      int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F);
+      if (low >= 8) {
+        low -= 16;
+      }
+      if (high >= 8) {
+        high -= 16;
+      }
+      dest[2 * byte_idx] = low;
+      dest[2 * byte_idx + 1] = high;
+    }
+  }
+
+  Error strip_delegate_padding(
+      const char* src,
+      char* dest,
+      size_t chunk_count,
+      size_t dest_chunk_size,
+      size_t src_chunk_size) const {
+    if (dest_chunk_size > src_chunk_size) {
+      ET_LOG(
+          Error,
+          "dest chunk size %zu must not exceed src chunk size %zu",
+          dest_chunk_size,
+          src_chunk_size);
+      return Error::InvalidProgram;
+    }
+    if (src == nullptr || dest == nullptr) {
+      ET_LOG(Error, "Ethos-U padded copy received null buffer");
+      return Error::InvalidState;
+    }
+    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+      memcpy(dest, src, dest_chunk_size);
+      src += src_chunk_size;
+      dest += dest_chunk_size;
+    }
+    return Error::Ok;
+  }
+
   void calculate_dimensions(
       const executorch::aten::Tensor tensor,
       VelaIO* io,
@@ -389,4 +547,4 @@ static auto registered = register_backend(backend_id);
 
 } // namespace arm
 } // namespace backends
-} // namespace executorch
+} // namespace executorch
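
For reference, a minimal host-side sketch (not from the commit) of the padding-strip pattern used by strip_delegate_padding above: copy only the leading payload bytes of each source chunk and skip the padded tail. The function name, buffer contents, and sizes below are made-up examples.

// Illustrative only: each source chunk holds src_chunk_size bytes but only
// the first dest_chunk_size bytes are tensor payload; the rest is padding.
#include <cstddef>
#include <cstdio>
#include <cstring>

static void strip_padding_example(
    const char* src,
    char* dest,
    std::size_t chunk_count,
    std::size_t dest_chunk_size,
    std::size_t src_chunk_size) {
  for (std::size_t i = 0; i < chunk_count; ++i) {
    std::memcpy(dest, src, dest_chunk_size); // keep payload bytes
    src += src_chunk_size;                   // step over the padded tail
    dest += dest_chunk_size;
  }
}

int main() {
  // Two chunks of 8 bytes each, of which only 6 bytes are payload.
  const char src[16] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 0,
                        'g', 'h', 'i', 'j', 'k', 'l', 0, 0};
  char dest[12];
  strip_padding_example(src, dest, /*chunk_count=*/2,
                        /*dest_chunk_size=*/6, /*src_chunk_size=*/8);
  std::fwrite(dest, 1, sizeof(dest), stdout); // prints "abcdefghijkl"
  std::printf("\n");
  return 0;
}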
