186 changes: 172 additions & 14 deletions backends/arm/runtime/EthosUBackend.cpp
@@ -326,7 +326,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
return Error::InvalidProgram;
}
-    int tensor_dim = 0, io_dim = 0;
+    size_t tensor_bytes_total = 0;
+    size_t io_bytes_total = 0;
// Write outputs from scratch into EValue pointers
for (int i = 0; i < handles.outputs->count; i++) {
int tensor_count = 1, io_count = 1;
@@ -338,23 +339,39 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
calculate_dimensions(
tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);

-      // At times the topological order of the outputs may change.
-      // Lets instead ensure that the sum of dimensions match.
-      tensor_dim = tensor_dim + tensor_count;
-      io_dim = io_dim + io_count;
+      size_t tensor_bytes = tensor_out.nbytes();
+      size_t io_bytes = static_cast<size_t>(io_count) *
+          static_cast<size_t>(handles.outputs->io[i].elem_size);

+      if (tensor_bytes != io_bytes) {
+        Error status = copy_with_layout_adjustment(
+            handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
+        if (status != Error::Ok) {
+          return status;
+        }
+        io_bytes_total += tensor_bytes;
+      } else {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");

-      EXECUTORCH_PROF_SCOPE(
-          event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
+        memcpy(
+            tensor_out.mutable_data_ptr<char>(),
+            static_cast<const char*>(output_addr),
+            tensor_bytes);
+        io_bytes_total += io_bytes;
+      }

-      memcpy(
-          tensor_out.mutable_data_ptr<char>(),
-          static_cast<const char*>(output_addr),
-          tensor_out.nbytes());
+      // At times the topological order of the outputs may change, so ensure
+      // that the total number of output bytes matches instead.
+      tensor_bytes_total += tensor_bytes;
    }
-    if (tensor_dim != io_dim) {
+    if (tensor_bytes_total != io_bytes_total) {
      ET_LOG(Error, "Total output tensor sizes do not match");
      ET_LOG(
-          Error, "Program expects size of %d but got %d", tensor_dim, io_dim);
+          Error,
+          "Program expects %zu bytes but got %zu",
+          io_bytes_total,
+          tensor_bytes_total);
return Error::InvalidProgram;
}
return Error::Ok;
@@ -365,6 +382,147 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
}

private:
// Copies Vela output into the ExecuTorch tensor, adjusting for padding or
// packed layouts produced by the delegate.
Error copy_with_layout_adjustment(
const VelaIO& output_io,
int output_index,
const char* src,
executorch::aten::Tensor& tensor_out,
size_t tensor_bytes) const {
const int elem_size = output_io.elem_size;
if (elem_size == 0) {
ET_LOG(
Error, "Ethos-U output %d reports zero element size", output_index);
return Error::InvalidProgram;
}

size_t chunk_count = 1;
for (int dim = 0; dim < shapeDim - 1; ++dim) {
const int vela_dim = output_io.shape[dim];
chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim);
}
const int last_dim = output_io.shape[shapeDim - 1];
const size_t vela_chunk_elems =
static_cast<size_t>(last_dim == 0 ? 1 : last_dim);
const size_t vela_chunk_size =
vela_chunk_elems * static_cast<size_t>(elem_size);

if (tensor_bytes % chunk_count != 0) {
ET_LOG(
Error,
"Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu",
output_index,
tensor_bytes,
chunk_count);
return Error::InvalidProgram;
}

const size_t chunk_size = tensor_bytes / chunk_count;

// If Vela writes fewer bytes than the tensor expects, the data may need
// to be expanded from 4-bit to 8-bit: Ethos-U outputs can be packed
// 4-bit values, while ExecuTorch tensors are at least 8-bit.
if (vela_chunk_size < chunk_size) {
if (chunk_size % vela_chunk_size != 0) {
ET_LOG(
Error,
"Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu",
output_index,
chunk_size,
vela_chunk_size);
return Error::InvalidProgram;
}

const size_t expand_factor = chunk_size / vela_chunk_size;
if (expand_factor == 2 && elem_size == 1 &&
tensor_out.scalar_type() == ScalarType::Char) {
return unpack_chunks_4bit_to_int8(
reinterpret_cast<const uint8_t*>(src),
tensor_out.mutable_data_ptr<int8_t>(),
chunk_count,
chunk_size,
vela_chunk_size);
}

ET_LOG(
Error,
"Ethos-U output %d expansion factor %zu with element size %d not supported",
output_index,
expand_factor,
elem_size);
return Error::InvalidProgram;
}

return strip_delegate_padding(
src,
tensor_out.mutable_data_ptr<char>(),
chunk_count,
chunk_size,
vela_chunk_size);
}

Error unpack_chunks_4bit_to_int8(
const uint8_t* src,
int8_t* dest,
size_t chunk_count,
size_t dest_chunk_size,
size_t src_chunk_size) const {
const uint8_t* chunk_src = src;
int8_t* chunk_dest = dest;
for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size);
chunk_src += src_chunk_size;
chunk_dest += dest_chunk_size;
}
return Error::Ok;
}

void unpack_single_chunk_4bit_to_int8(
const uint8_t* src,
int8_t* dest,
size_t chunk_size) const {
for (size_t byte_idx = 0; byte_idx < chunk_size; ++byte_idx) {
const uint8_t packed = src[byte_idx];
int8_t low = static_cast<int8_t>(packed & 0x0F);
int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F);
if (low >= 8) {
low -= 16;
}
if (high >= 8) {
high -= 16;
}
dest[2 * byte_idx] = low;
dest[2 * byte_idx + 1] = high;
}
}

Error strip_delegate_padding(
const char* src,
char* dest,
size_t chunk_count,
size_t dest_chunk_size,
size_t src_chunk_size) const {
if (dest_chunk_size > src_chunk_size) {
ET_LOG(
Error,
"dest chunk size %zu must not exceed src chunk size %zu",
dest_chunk_size,
src_chunk_size);
return Error::InvalidProgram;
}
if (src == nullptr || dest == nullptr) {
ET_LOG(Error, "Ethos-U padded copy received null buffer");
return Error::InvalidState;
}
for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
memcpy(dest, src, dest_chunk_size);
src += src_chunk_size;
dest += dest_chunk_size;
}
return Error::Ok;
}

void calculate_dimensions(
const executorch::aten::Tensor tensor,
VelaIO* io,
@@ -389,4 +547,4 @@ static auto registered = register_backend(backend_id);

} // namespace arm
} // namespace backends
} // namespace executorch
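
For illustration only (not part of the PR), here is a minimal standalone sketch of the nibble expansion that unpack_single_chunk_4bit_to_int8 performs: each packed source byte holds two two's-complement int4 values, low nibble first, and nibbles 0x8-0xF sign-extend to -8..-1.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Expand n_bytes of packed int4 data into 2 * n_bytes int8 values,
// low nibble first, sign-extending nibbles >= 8.
static void unpack_4bit_to_int8(const uint8_t* src, int8_t* dest, size_t n_bytes) {
  for (size_t i = 0; i < n_bytes; ++i) {
    int8_t low = static_cast<int8_t>(src[i] & 0x0F);
    int8_t high = static_cast<int8_t>((src[i] >> 4) & 0x0F);
    if (low >= 8) {
      low -= 16;
    }
    if (high >= 8) {
      high -= 16;
    }
    dest[2 * i] = low;
    dest[2 * i + 1] = high;
  }
}

int main() {
  const uint8_t packed[2] = {0x2F, 0x80};
  int8_t out[4] = {};
  unpack_4bit_to_int8(packed, out, 2);
  // 0x2F -> low 0xF = -1, high 0x2 = 2; 0x80 -> low 0x0 = 0, high 0x8 = -8
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}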
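
Similarly, a sketch of the padded-copy path, assuming a hypothetical int8 output of logical shape [2, 3, 5] whose innermost Vela dimension is padded to 16 elements: chunk_count = 2 * 3 = 6, chunk_size = 30 / 6 = 5 bytes, vela_chunk_size = 16 bytes, so strip_delegate_padding keeps only the first 5 bytes of each 16-byte source row.

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Copy dest_chunk_size bytes out of every src_chunk_size-byte source row,
// dropping the per-row padding added by the delegate.
static void strip_padding(
    const char* src,
    char* dest,
    size_t chunk_count,
    size_t dest_chunk_size,
    size_t src_chunk_size) {
  for (size_t i = 0; i < chunk_count; ++i) {
    std::memcpy(dest, src, dest_chunk_size);
    src += src_chunk_size;
    dest += dest_chunk_size;
  }
}

int main() {
  const size_t chunk_count = 2 * 3;  // product of all but the last dimension
  const size_t dest_chunk_size = 5;  // unpadded row size in bytes
  const size_t src_chunk_size = 16;  // padded row size in bytes
  std::vector<char> vela_out(chunk_count * src_chunk_size, 0);
  for (size_t row = 0; row < chunk_count; ++row) {
    for (size_t col = 0; col < dest_chunk_size; ++col) {
      vela_out[row * src_chunk_size + col] = static_cast<char>(row * 10 + col);
    }
  }
  std::vector<char> tensor_out(chunk_count * dest_chunk_size, 0);
  strip_padding(
      vela_out.data(), tensor_out.data(), chunk_count, dest_chunk_size, src_chunk_size);
  std::printf("%d %d\n", tensor_out[5], tensor_out[29]);  // prints "10 54"
  return 0;
}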