Skip to content

Commit 9e1a3cc

Browse files
committed
RAII
1 parent e535aee commit 9e1a3cc

File tree

2 files changed

+72
-41
lines changed

2 files changed

+72
-41
lines changed

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 63 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#include <executorch/runtime/core/error.h>
1313
#include <executorch/runtime/core/evalue.h>
1414
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
15-
#include <climits>
1615
#include <cstdio>
1716

1817
#include <filesystem>
@@ -49,20 +48,22 @@ using executorch::runtime::Result;
4948
using executorch::runtime::Span;
5049
using executorch::runtime::etensor::Tensor;
5150

52-
// Structure to hold a reference to a GPU tensor for "keep on device" optimization.
53-
// Owns the tensor handle - must be deleted when no longer needed.
51+
// Structure to hold a reference to a GPU tensor for "keep on device"
52+
// optimization. Owns the tensor handle - must be deleted when no longer needed.
5453
struct GpuTensorRef {
55-
AOTITensorHandle handle; // Tensor handle (owned, for later deletion)
56-
void* data_ptr; // GPU memory pointer (for D2D copy)
57-
size_t size_bytes; // Total size in bytes
54+
AOTITensorHandle handle; // Tensor handle (owned, for later deletion)
55+
void* data_ptr; // GPU memory pointer (for D2D copy)
56+
size_t size_bytes; // Total size in bytes
5857
};
5958

6059
class ET_EXPERIMENTAL CudaBackend final
6160
: public ::executorch::runtime::BackendInterface {
6261
private:
6362
// Storage control options (set via set_option before execute)
64-
mutable std::string store_output_name_; // Name to store output under (empty = none)
65-
mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none)
63+
mutable std::string
64+
store_output_name_; // Name to store output under (empty = none)
65+
mutable std::string
66+
use_stored_input_name_; // Name of stored tensor to use (empty = none)
6667

6768
// Per-instance map of named GPU tensor references.
6869
// Mutable because execute() is const but needs to modify this.
@@ -305,8 +306,41 @@ class ET_EXPERIMENTAL CudaBackend final
305306
std::vector<AOTITensorHandle> gpu_outputs(
306307
n_outputs); // GPU tensors for kernel output
307308

309+
// RAII helper to ensure GPU tensors are cleaned up on all exit paths.
310+
// Prevents memory leaks when errors occur during execute().
311+
struct TensorCleanup {
312+
std::vector<AOTITensorHandle>& inputs;
313+
std::vector<AOTITensorHandle>& outputs;
314+
const std::unordered_map<std::string, GpuTensorRef>& stored_tensors;
315+
316+
~TensorCleanup() {
317+
// Clean up input tensors
318+
for (auto* handle : inputs) {
319+
if (handle != nullptr) {
320+
aoti_torch_delete_tensor_object(handle);
321+
}
322+
}
323+
// Clean up output tensors, except those that are stored
324+
for (auto* handle : outputs) {
325+
if (handle != nullptr) {
326+
bool is_stored = false;
327+
for (const auto& pair : stored_tensors) {
328+
if (pair.second.handle == handle) {
329+
is_stored = true;
330+
break;
331+
}
332+
}
333+
if (!is_stored) {
334+
aoti_torch_delete_tensor_object(handle);
335+
}
336+
}
337+
}
338+
}
339+
};
340+
TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};
341+
308342
// Process input tensors: ExecuTorch provides CPU tensors, create GPU
309-
// copies. For cached inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
343+
// copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
310344
for (int i = 0; i < n_inputs; i++) {
311345
// Get tensor dimensions and properties from ExecuTorch CPU tensor
312346
auto cpu_tensor = &(args[i]->toTensor());
@@ -334,7 +368,10 @@ class ET_EXPERIMENTAL CudaBackend final
334368

335369
gpu_inputs[i] = gpu_input_handle;
336370

337-
// Check if this input matches a stored GPU tensor (by size)
371+
// Check if this input matches a stored GPU tensor (by size).
372+
// Note: Size-based matching assumes only one input will match. If
373+
// multiple inputs have the same byte size as the stored tensor, the first
374+
// match wins.
338375
if (!use_stored_input_name_.empty()) {
339376
auto it = gpu_tensors_.find(use_stored_input_name_);
340377
if (it != gpu_tensors_.end()) {
@@ -345,6 +382,13 @@ class ET_EXPERIMENTAL CudaBackend final
345382

346383
// Match by size: use stored tensor if sizes match
347384
if (copy_bytes == ref.size_bytes) {
385+
ET_LOG(
386+
Debug,
387+
"Using stored tensor '%s' for input %d (%zu bytes, D2D copy)",
388+
use_stored_input_name_.c_str(),
389+
i,
390+
copy_bytes);
391+
348392
// GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
349393
cudaError_t cuda_err = cudaMemcpy(
350394
gpu_inputs[i]->data_ptr(),
@@ -418,9 +462,14 @@ class ET_EXPERIMENTAL CudaBackend final
418462
error);
419463

420464
// Store reference to output GPU tensor if requested.
421-
// Always uses gpu_outputs[0] (encoder has single output).
422465
// The tensor will be kept alive for later D2D copy to decoder inputs.
423-
if (!store_output_name_.empty() && n_outputs > 0) {
466+
if (!store_output_name_.empty()) {
467+
ET_CHECK_OR_RETURN_ERROR(
468+
n_outputs == 1,
469+
InvalidArgument,
470+
"store_output only supports single-output methods, got %zu outputs",
471+
n_outputs);
472+
424473
auto* gpu_tensor = gpu_outputs[0];
425474
size_t numel = gpu_tensor->numel();
426475
size_t elem_size = gpu_tensor->element_size();
@@ -430,7 +479,7 @@ class ET_EXPERIMENTAL CudaBackend final
430479
auto old_it = gpu_tensors_.find(store_output_name_);
431480
if (old_it != gpu_tensors_.end()) {
432481
AOTITensorHandle old_handle = old_it->second.handle;
433-
gpu_tensors_.erase(old_it); // Remove from map before deleting
482+
gpu_tensors_.erase(old_it); // Remove from map before deleting
434483
if (old_handle != nullptr) {
435484
aoti_torch_delete_tensor_object(old_handle);
436485
}
@@ -462,6 +511,7 @@ class ET_EXPERIMENTAL CudaBackend final
462511
}
463512

464513
// Memory management notes:
514+
// - GPU tensor cleanup is handled by TensorCleanup RAII guard above.
465515
// - use_stored_input setting persists across execute() calls to support
466516
// decoder loops that reuse the stored encoder output.
467517
// - Stored GPU tensors (in gpu_tensors_) remain in memory until:
@@ -470,26 +520,6 @@ class ET_EXPERIMENTAL CudaBackend final
470520
// - The "reset_stored_input" option only resets the input name setting,
471521
// NOT the stored GPU tensors themselves.
472522

473-
// Cleanup: delete GPU tensors to avoid memory leak across execute() calls.
474-
// Input tensors are no longer needed after AOTI execution.
475-
for (size_t i = 0; i < n_inputs; i++) {
476-
aoti_torch_delete_tensor_object(gpu_inputs[i]);
477-
}
478-
// Output tensors are no longer needed after copying to CPU,
479-
// EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy).
480-
for (size_t i = 0; i < n_outputs; i++) {
481-
bool is_stored = false;
482-
for (const auto& pair : gpu_tensors_) {
483-
if (pair.second.handle == gpu_outputs[i]) {
484-
is_stored = true;
485-
break;
486-
}
487-
}
488-
if (!is_stored) {
489-
aoti_torch_delete_tensor_object(gpu_outputs[i]);
490-
}
491-
}
492-
493523
return Error::Ok;
494524
}
495525

extension/asr/runner/runner.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,10 +202,11 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
202202
{
203203
::executorch::runtime::BackendOptions<1> opts;
204204
opts.set_option("store_output", "encoder_output");
205-
auto err =
206-
::executorch::runtime::set_option("CudaBackend", opts.view());
205+
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
207206
if (err != ::executorch::runtime::Error::Ok) {
208-
ET_LOG(Warning, "Failed to set store_output option (backend may not support storage)");
207+
ET_LOG(
208+
Warning,
209+
"Failed to set store_output option (backend may not support storage)");
209210
}
210211
}
211212

@@ -268,10 +269,11 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
268269
{
269270
::executorch::runtime::BackendOptions<1> opts;
270271
opts.set_option("use_stored_input", "encoder_output");
271-
auto err =
272-
::executorch::runtime::set_option("CudaBackend", opts.view());
272+
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
273273
if (err != ::executorch::runtime::Error::Ok) {
274-
ET_LOG(Warning, "Failed to set use_stored_input option (backend may not support storage)");
274+
ET_LOG(
275+
Warning,
276+
"Failed to set use_stored_input option (backend may not support storage)");
275277
}
276278
}
277279

@@ -338,8 +340,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
338340
{
339341
::executorch::runtime::BackendOptions<1> opts;
340342
opts.set_option("reset_stored_input", true);
341-
auto err =
342-
::executorch::runtime::set_option("CudaBackend", opts.view());
343+
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
343344
if (err != ::executorch::runtime::Error::Ok) {
344345
ET_LOG(Warning, "Failed to set reset_stored_input option");
345346
}

0 commit comments

Comments
 (0)