From 965b6cf47a87c95110fca87ca1c8bcb4574f4715 Mon Sep 17 00:00:00 2001 From: Sebastian Larsson Date: Mon, 3 Nov 2025 11:12:50 +0100 Subject: [PATCH] Arm backend: Handle Ethos-U output layout mismatches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vela can pad or pack an inference output, so the byte layout of a tensor may not match what ExecuTorch expects. The runtime now detects those cases and strips padding and/or expands packed 4-bit activations back into signed int8 tensors. Change-Id: I730b91b83f3793e7fba0755e4a5ed01147555cb8 Signed-off-by: Sebastian Larsson --- backends/arm/runtime/EthosUBackend.cpp | 186 +++++++++++++++++++++++-- 1 file changed, 172 insertions(+), 14 deletions(-) diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 08589c34c69..97c2f51e0df 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -326,7 +326,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Error, "Ethos-U invocation failed error (%d)", result); return Error::InvalidProgram; } - int tensor_dim = 0, io_dim = 0; + size_t tensor_bytes_total = 0; + size_t io_bytes_total = 0; // Write outputs from scratch into EValue pointers for (int i = 0; i < handles.outputs->count; i++) { int tensor_count = 1, io_count = 1; @@ -338,23 +339,39 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { calculate_dimensions( tensor_out, &handles.outputs->io[i], &tensor_count, &io_count); - // At times the topological order of the outputs may change. - // Lets instead ensure that the sum of dimensions match. 
- tensor_dim = tensor_dim + tensor_count; - io_dim = io_dim + io_count; + size_t tensor_bytes = tensor_out.nbytes(); + size_t io_bytes = static_cast<size_t>(io_count) * + static_cast<size_t>(handles.outputs->io[i].elem_size); + + if (tensor_bytes != io_bytes) { + Error status = copy_with_layout_adjustment( + handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes); + if (status != Error::Ok) { + return status; + } + io_bytes_total += tensor_bytes; + } else { + EXECUTORCH_PROF_SCOPE( + event_tracer, "+EthosUBackend::execute()handles.output.memcpy()"); - EXECUTORCH_PROF_SCOPE( - event_tracer, "+EthosUBackend::execute()handles.output.memcpy()"); + memcpy( + tensor_out.mutable_data_ptr(), + static_cast<const void*>(output_addr), + tensor_bytes); + io_bytes_total += io_bytes; + } - memcpy( - tensor_out.mutable_data_ptr(), - static_cast<const void*>(output_addr), - tensor_out.nbytes()); + // At times the topological order of the outputs may change. + // Lets instead ensure that the sum of output bytes match. + tensor_bytes_total += tensor_bytes; } - if (tensor_dim != io_dim) { + if (tensor_bytes_total != io_bytes_total) { ET_LOG(Error, "Total output tensor sizes do not match"); ET_LOG( - Error, "Program expects size of %d but got %d", tensor_dim, io_dim); + Error, + "Program expects %zu bytes but got %zu", + io_bytes_total, + tensor_bytes_total); return Error::InvalidProgram; } return Error::Ok; @@ -365,6 +382,147 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { } private: + // Copies Vela output into the ExecuTorch tensor, adjusting for padding or + // packed layouts produced by the delegate. 
+ Error copy_with_layout_adjustment( + const VelaIO& output_io, + int output_index, + const char* src, + executorch::aten::Tensor& tensor_out, + size_t tensor_bytes) const { + const int elem_size = output_io.elem_size; + if (elem_size == 0) { + ET_LOG( + Error, "Ethos-U output %d reports zero element size", output_index); + return Error::InvalidProgram; + } + + size_t chunk_count = 1; + for (int dim = 0; dim < shapeDim - 1; ++dim) { + const int vela_dim = output_io.shape[dim]; + chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim); + } + const int last_dim = output_io.shape[shapeDim - 1]; + const size_t vela_chunk_elems = + static_cast<size_t>(last_dim == 0 ? 1 : last_dim); + const size_t vela_chunk_size = + vela_chunk_elems * static_cast<size_t>(elem_size); + + if (tensor_bytes % chunk_count != 0) { + ET_LOG( + Error, + "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu", + output_index, + tensor_bytes, + chunk_count); + return Error::InvalidProgram; + } + + const size_t chunk_size = tensor_bytes / chunk_count; + + // If Vela writes fewer bytes than the tensor expects we may need to + // expand 4-bit data to 8-bit. Ethos-U outputs may be + // packed 4-bit values but ExecuTorch tensors are at least 8-bit. 
+ if (vela_chunk_size < chunk_size) { + if (chunk_size % vela_chunk_size != 0) { + ET_LOG( + Error, + "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu", + output_index, + chunk_size, + vela_chunk_size); + return Error::InvalidProgram; + } + + const size_t expand_factor = chunk_size / vela_chunk_size; + if (expand_factor == 2 && elem_size == 1 && + tensor_out.scalar_type() == ScalarType::Char) { + return unpack_chunks_4bit_to_int8( + reinterpret_cast<const uint8_t*>(src), + tensor_out.mutable_data_ptr<int8_t>(), + chunk_count, + chunk_size, + vela_chunk_size); + } + + ET_LOG( + Error, + "Ethos-U output %d expansion factor %zu with element size %d not supported", + output_index, + expand_factor, + elem_size); + return Error::InvalidProgram; + } + + return strip_delegate_padding( + src, + tensor_out.mutable_data_ptr<char>(), + chunk_count, + chunk_size, + vela_chunk_size); + } + + Error unpack_chunks_4bit_to_int8( + const uint8_t* src, + int8_t* dest, + size_t chunk_count, + size_t dest_chunk_size, + size_t src_chunk_size) const { + const uint8_t* chunk_src = src; + int8_t* chunk_dest = dest; + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size); + chunk_src += src_chunk_size; + chunk_dest += dest_chunk_size; + } + return Error::Ok; + } + + void unpack_single_chunk_4bit_to_int8( + const uint8_t* src, + int8_t* dest, + size_t chunk_size) const { + for (size_t byte_idx = 0; byte_idx < chunk_size; ++byte_idx) { + const uint8_t packed = src[byte_idx]; + int8_t low = static_cast<int8_t>(packed & 0x0F); + int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F); + if (low >= 8) { + low -= 16; + } + if (high >= 8) { + high -= 16; + } + dest[2 * byte_idx] = low; + dest[2 * byte_idx + 1] = high; + } + } + + Error strip_delegate_padding( + const char* src, + char* dest, + size_t chunk_count, + size_t dest_chunk_size, + size_t src_chunk_size) const { + if (dest_chunk_size > src_chunk_size) { + ET_LOG( + Error, + 
"dest chunk size %zu must not exceed src chunk size %zu", + dest_chunk_size, + src_chunk_size); + return Error::InvalidProgram; + } + if (src == nullptr || dest == nullptr) { + ET_LOG(Error, "Ethos-U padded copy received null buffer"); + return Error::InvalidState; + } + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + memcpy(dest, src, dest_chunk_size); + src += src_chunk_size; + dest += dest_chunk_size; + } + return Error::Ok; + } + void calculate_dimensions( const executorch::aten::Tensor tensor, VelaIO* io, @@ -389,4 +547,4 @@ static auto registered = register_backend(backend_id); } // namespace arm } // namespace backends -} // namespace executorch \ No newline at end of file +} // namespace executorch