Skip to content

Commit 308cad0

Browse files
perheld and Martin Lindström
authored
Arm backend: Eliminate one memory copy in executor runner (pytorch#12992)
Prior to this patch, temporary tensors/buffers were used to hold the input data which was then copied over to the actual input tensors for running the inference. This patch removes this copying by instead writing the input data directly to the input tensors. Signed-off-by: [email protected] cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 Signed-off-by: [email protected] Co-authored-by: Martin Lindström <[email protected]>
1 parent b32be4a commit 308cad0

File tree

1 file changed

+37
-58
lines changed

1 file changed

+37
-58
lines changed

examples/arm/executor_runner/arm_executor_runner.cpp

Lines changed: 37 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ class Box {
289289
}
290290
};
291291

292-
Result<BufferCleanup> prepare_input_tensors(
292+
Error prepare_input_tensors(
293293
Method& method,
294294
MemoryAllocator& allocator,
295295
const std::vector<std::pair<char*, size_t>>& input_buffers) {
@@ -304,12 +304,15 @@ Result<BufferCleanup> prepare_input_tensors(
304304
"Wrong number of inputs allocated compared to method");
305305
#endif
306306

307-
void** inputs =
308-
static_cast<void**>(allocator.allocate(num_inputs * sizeof(void*)));
307+
EValue* input_evalues =
308+
static_cast<EValue*>(allocator.allocate(num_inputs * sizeof(EValue*)));
309309
ET_CHECK_OR_RETURN_ERROR(
310-
inputs != nullptr,
310+
input_evalues != nullptr,
311311
MemoryAllocationFailed,
312-
"Could not allocate memory for pointers to input buffers.");
312+
"Could not allocate memory for input evalues.");
313+
314+
Error err = method.get_inputs(input_evalues, num_inputs);
315+
ET_CHECK_OK_OR_RETURN_ERROR(err);
313316

314317
for (size_t i = 0; i < num_inputs; i++) {
315318
auto tag = method_meta.input_tag(i);
@@ -322,67 +325,54 @@ Result<BufferCleanup> prepare_input_tensors(
322325
Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(i);
323326
ET_CHECK_OK_OR_RETURN_ERROR(tensor_meta.error());
324327

325-
// Input is a tensor. Allocate a buffer for it.
326-
void* data_ptr = allocator.allocate(tensor_meta->nbytes());
327-
ET_CHECK_OR_RETURN_ERROR(
328-
data_ptr != nullptr,
329-
MemoryAllocationFailed,
330-
"Could not allocate memory for input buffers.");
331-
inputs[num_allocated++] = data_ptr;
332-
333-
Error err = Error::Ok;
328+
err = Error::Ok;
334329
if (input_buffers.size() > 0) {
335330
auto [buffer, buffer_size] = input_buffers.at(i);
336331
if (buffer_size != tensor_meta->nbytes()) {
337332
ET_LOG(
338333
Error,
339-
"input size (%d) and tensor size (%d) missmatch!",
334+
"input size (%d) and tensor size (%d) mismatch!",
340335
buffer_size,
341336
tensor_meta->nbytes());
342337
err = Error::InvalidArgument;
343-
} else {
344-
ET_LOG(Info, "Copying read input to tensor.");
345-
std::memcpy(data_ptr, buffer, buffer_size);
338+
} else if (input_evalues[i].isTensor()) {
339+
// Copy the data from the input buffer to the tensor
340+
Tensor& tensor = input_evalues[i].toTensor();
341+
std::memcpy(tensor.mutable_data_ptr<int8_t>(), buffer, buffer_size);
346342
}
347343
}
348344

349-
TensorImpl impl = TensorImpl(
350-
tensor_meta.get().scalar_type(),
351-
tensor_meta.get().sizes().size(),
352-
const_cast<TensorImpl::SizesType*>(tensor_meta.get().sizes().data()),
353-
data_ptr,
354-
const_cast<TensorImpl::DimOrderType*>(
355-
tensor_meta.get().dim_order().data()));
356-
Tensor t(&impl);
357-
358345
// If input_buffers.size <= 0, we don't have any input, fill it with 1's.
359346
if (input_buffers.size() <= 0) {
360-
for (size_t j = 0; j < t.numel(); j++) {
361-
switch (t.scalar_type()) {
347+
if (input_evalues[i].isTensor()) {
348+
Tensor& tensor = input_evalues[i].toTensor();
349+
switch (tensor.scalar_type()) {
362350
case ScalarType::Int:
363-
t.mutable_data_ptr<int>()[j] = 1;
351+
std::fill(
352+
tensor.mutable_data_ptr<int>(),
353+
tensor.mutable_data_ptr<int>() + tensor.numel(),
354+
1);
364355
break;
365356
case ScalarType::Float:
366-
t.mutable_data_ptr<float>()[j] = 1.;
357+
std::fill(
358+
tensor.mutable_data_ptr<float>(),
359+
tensor.mutable_data_ptr<float>() + tensor.numel(),
360+
1.0);
367361
break;
368362
case ScalarType::Char:
369-
t.mutable_data_ptr<int8_t>()[j] = 1;
363+
std::fill(
364+
tensor.mutable_data_ptr<int8_t>(),
365+
tensor.mutable_data_ptr<int8_t>() + tensor.numel(),
366+
1);
370367
break;
371368
}
369+
} else {
370+
printf("Input[%d]: Not Tensor\n", i);
372371
}
373372
}
374-
375-
err = method.set_input(t, i);
376-
377-
if (err != Error::Ok) {
378-
ET_LOG(
379-
Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err);
380-
// The BufferCleanup will free the inputs when it goes out of scope.
381-
BufferCleanup cleanup({inputs, num_allocated});
382-
return err;
383-
}
384373
}
385-
return BufferCleanup({inputs, num_allocated});
374+
375+
return err;
386376
}
387377

388378
#if defined(SEMIHOSTING)
@@ -437,7 +427,6 @@ struct RunnerContext {
437427
size_t input_memsize = 0;
438428
size_t pte_size = 0;
439429
bool bundle_io = false;
440-
Box<Result<BufferCleanup>> prepared_inputs;
441430
Box<ArmMemoryAllocator> method_allocator;
442431
Box<ArmMemoryAllocator> temp_allocator;
443432
Box<Result<Method>> method;
@@ -591,20 +580,10 @@ void runner_init(
591580
} else
592581
#endif
593582
{
594-
// Here you would add code to get input from your Hardware
595-
// Get inputs from SEMIHOSTING or fake it with a lot of "1"
596-
// Use "static" to force to compiler to remove this when it goes out of
597-
// scope
598-
ctx.prepared_inputs.reset(::prepare_input_tensors(
599-
*ctx.method.value(), ctx.method_allocator.value(), input_buffers));
600-
601-
if (!ctx.prepared_inputs->ok()) {
602-
ET_LOG(
603-
Info,
604-
"Preparing inputs tensors for method %s failed with status 0x%" PRIx32,
605-
ctx.method_name,
606-
ctx.prepared_inputs->error());
607-
}
583+
Error status = ::prepare_input_tensors(
584+
*ctx.method.value(), ctx.method_allocator.value(), input_buffers);
585+
ET_CHECK_MSG(
586+
status == Error::Ok, "Failed to prepare inputs 0x%" PRIx32, status);
608587
}
609588
#if defined(ET_DUMP_INPUT)
610589
{

0 commit comments

Comments (0)