#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
-#include <climits>
#include <cstdio>

#include <filesystem>
@@ -49,20 +48,22 @@ using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::etensor::Tensor;

-// Structure to hold a reference to a GPU tensor for "keep on device" optimization.
-// Owns the tensor handle - must be deleted when no longer needed.
+// Structure to hold a reference to a GPU tensor for "keep on device"
+// optimization. Owns the tensor handle - must be deleted when no longer needed.
struct GpuTensorRef {
-  AOTITensorHandle handle;    // Tensor handle (owned, for later deletion)
-  void* data_ptr;             // GPU memory pointer (for D2D copy)
-  size_t size_bytes;          // Total size in bytes
+  AOTITensorHandle handle; // Tensor handle (owned, for later deletion)
+  void* data_ptr; // GPU memory pointer (for D2D copy)
+  size_t size_bytes; // Total size in bytes
};

class ET_EXPERIMENTAL CudaBackend final
    : public ::executorch::runtime::BackendInterface {
 private:
  // Storage control options (set via set_option before execute)
-  mutable std::string store_output_name_; // Name to store output under (empty = none)
-  mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none)
+  mutable std::string
+      store_output_name_; // Name to store output under (empty = none)
+  mutable std::string
+      use_stored_input_name_; // Name of stored tensor to use (empty = none)

  // Per-instance map of named GPU tensor references.
  // Mutable because execute() is const but needs to modify this.
@@ -305,8 +306,41 @@ class ET_EXPERIMENTAL CudaBackend final
    std::vector<AOTITensorHandle> gpu_outputs(
        n_outputs); // GPU tensors for kernel output

+    // RAII helper to ensure GPU tensors are cleaned up on all exit paths.
+    // Prevents memory leaks when errors occur during execute().
+    struct TensorCleanup {
+      std::vector<AOTITensorHandle>& inputs;
+      std::vector<AOTITensorHandle>& outputs;
+      const std::unordered_map<std::string, GpuTensorRef>& stored_tensors;
+
+      ~TensorCleanup() {
+        // Clean up input tensors
+        for (auto* handle : inputs) {
+          if (handle != nullptr) {
+            aoti_torch_delete_tensor_object(handle);
+          }
+        }
+        // Clean up output tensors, except those that are stored
+        for (auto* handle : outputs) {
+          if (handle != nullptr) {
+            bool is_stored = false;
+            for (const auto& pair : stored_tensors) {
+              if (pair.second.handle == handle) {
+                is_stored = true;
+                break;
+              }
+            }
+            if (!is_stored) {
+              aoti_torch_delete_tensor_object(handle);
+            }
+          }
+        }
+      }
+    };
+    TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};
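+    // Because `cleanup` is declared after the two vectors it references,
+    // its destructor runs first during scope unwinding, and it also fires
+    // on the early returns taken by the ET_CHECK_* macros below.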
+
    // Process input tensors: ExecuTorch provides CPU tensors, create GPU
-    // copies. For cached inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
+    // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
    for (int i = 0; i < n_inputs; i++) {
      // Get tensor dimensions and properties from ExecuTorch CPU tensor
      auto cpu_tensor = &(args[i]->toTensor());
@@ -334,7 +368,10 @@ class ET_EXPERIMENTAL CudaBackend final

      gpu_inputs[i] = gpu_input_handle;

-      // Check if this input matches a stored GPU tensor (by size)
+      // Check if this input matches a stored GPU tensor (by size).
+      // Note: Size-based matching assumes only one input will match. If
+      // multiple inputs have the same byte size as the stored tensor, the
+      // first match wins.
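+      // (Illustrative example: a 1x1500x512 float32 encoder output occupies
+      // 1*1500*512*4 = 3,072,000 bytes; only an input with exactly that
+      // byte size is served from the stored tensor.)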
      if (!use_stored_input_name_.empty()) {
        auto it = gpu_tensors_.find(use_stored_input_name_);
        if (it != gpu_tensors_.end()) {
@@ -345,6 +382,13 @@ class ET_EXPERIMENTAL CudaBackend final

          // Match by size: use stored tensor if sizes match
          if (copy_bytes == ref.size_bytes) {
+            ET_LOG(
+                Debug,
+                "Using stored tensor '%s' for input %d (%zu bytes, D2D copy)",
+                use_stored_input_name_.c_str(),
+                i,
+                copy_bytes);
+
            // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
            cudaError_t cuda_err = cudaMemcpy(
                gpu_inputs[i]->data_ptr(),
@@ -418,9 +462,14 @@ class ET_EXPERIMENTAL CudaBackend final
        error);

    // Store reference to output GPU tensor if requested.
-    // Always uses gpu_outputs[0] (encoder has single output).
    // The tensor will be kept alive for later D2D copy to decoder inputs.
-    if (!store_output_name_.empty() && n_outputs > 0) {
+    if (!store_output_name_.empty()) {
+      ET_CHECK_OR_RETURN_ERROR(
+          n_outputs == 1,
+          InvalidArgument,
+          "store_output only supports single-output methods, got %zu outputs",
+          n_outputs);
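+      // If this check fails, the early return is still leak-free: the
+      // TensorCleanup guard above frees any GPU tensors allocated so far.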
+
      auto* gpu_tensor = gpu_outputs[0];
      size_t numel = gpu_tensor->numel();
      size_t elem_size = gpu_tensor->element_size();
@@ -430,7 +479,7 @@ class ET_EXPERIMENTAL CudaBackend final
      auto old_it = gpu_tensors_.find(store_output_name_);
      if (old_it != gpu_tensors_.end()) {
        AOTITensorHandle old_handle = old_it->second.handle;
-        gpu_tensors_.erase(old_it);  // Remove from map before deleting
+        gpu_tensors_.erase(old_it); // Remove from map before deleting
        if (old_handle != nullptr) {
          aoti_torch_delete_tensor_object(old_handle);
        }
@@ -462,6 +511,7 @@ class ET_EXPERIMENTAL CudaBackend final
    }

    // Memory management notes:
+    // - GPU tensor cleanup is handled by the TensorCleanup RAII guard above.
    // - use_stored_input setting persists across execute() calls to support
    //   decoder loops that reuse the stored encoder output.
    // - Stored GPU tensors (in gpu_tensors_) remain in memory until:
@@ -470,26 +520,6 @@ class ET_EXPERIMENTAL CudaBackend final
    // - The "reset_stored_input" option only resets the input name setting,
    //   NOT the stored GPU tensors themselves.
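+    // - Typical runner-side sequence (illustrative; the exact set_option
+    //   plumbing depends on the runtime integration):
+    //     set_option "store_output" = "enc"     -> run encoder once
+    //     set_option "use_stored_input" = "enc" -> run decoder steps (D2D)
+    //     set_option "reset_stored_input"       -> revert to plain CPU inputs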

-    // Cleanup: delete GPU tensors to avoid memory leak across execute() calls.
-    // Input tensors are no longer needed after AOTI execution.
-    for (size_t i = 0; i < n_inputs; i++) {
-      aoti_torch_delete_tensor_object(gpu_inputs[i]);
-    }
-    // Output tensors are no longer needed after copying to CPU,
-    // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy).
-    for (size_t i = 0; i < n_outputs; i++) {
-      bool is_stored = false;
-      for (const auto& pair : gpu_tensors_) {
-        if (pair.second.handle == gpu_outputs[i]) {
-          is_stored = true;
-          break;
-        }
-      }
-      if (!is_stored) {
-        aoti_torch_delete_tensor_object(gpu_outputs[i]);
-      }
-    }
-
    return Error::Ok;
  }
