 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <climits>
 #include <cstdio>
 
 #include <filesystem>
@@ -56,18 +57,40 @@ struct GpuTensorRef {
   size_t size_bytes; // Total size in bytes
 };
 
-// Global map of named GPU tensor references.
-// Note: NOT thread-safe. Callers must ensure execute() is called from a single thread.
-static std::unordered_map<std::string, GpuTensorRef> g_gpu_tensors;
+// Parses "slot:name" format string. Returns true on success.
+// Uses character-by-character parsing to avoid std::stoi exceptions.
+static bool parse_slot_name(
+    const std::string& val,
+    int& out_slot,
+    std::string& out_name) {
+  auto colon_pos = val.find(':');
+  if (colon_pos == std::string::npos || colon_pos == 0) {
+    return false;
+  }
 
-// Helper to clear stored GPU tensors and free their memory
-static void clear_gpu_tensors() {
-  for (auto& pair : g_gpu_tensors) {
-    if (pair.second.handle != nullptr) {
-      aoti_torch_delete_tensor_object(pair.second.handle);
+  // Parse slot number manually to avoid exceptions
+  int slot = 0;
+  for (size_t i = 0; i < colon_pos; i++) {
+    char c = val[i];
+    if (c < '0' || c > '9') {
+      return false; // Non-digit character
+    }
+    int digit = c - '0';
+    // Check for overflow
+    if (slot > (INT_MAX - digit) / 10) {
+      return false;
     }
+    slot = slot * 10 + digit;
   }
-  g_gpu_tensors.clear();
+
+  std::string name = val.substr(colon_pos + 1);
+  if (name.empty()) {
+    return false;
+  }
+
+  out_slot = slot;
+  out_name = std::move(name);
+  return true;
 }
 
 class ET_EXPERIMENTAL CudaBackend final
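
For reference, a minimal standalone sketch of the new parser's contract. The function body is copied from this diff; the `main` driver and test inputs are illustrative only:

```cpp
#include <cassert>
#include <climits>
#include <cstddef>
#include <string>

// Copy of the parser added in this diff (same logic, condensed braces).
static bool parse_slot_name(
    const std::string& val, int& out_slot, std::string& out_name) {
  auto colon_pos = val.find(':');
  if (colon_pos == std::string::npos || colon_pos == 0) return false;
  int slot = 0;
  for (size_t i = 0; i < colon_pos; i++) {
    char c = val[i];
    if (c < '0' || c > '9') return false;            // non-digit character
    int digit = c - '0';
    if (slot > (INT_MAX - digit) / 10) return false; // would overflow int
    slot = slot * 10 + digit;
  }
  std::string name = val.substr(colon_pos + 1);
  if (name.empty()) return false;
  out_slot = slot;
  out_name = std::move(name);
  return true;
}

int main() {
  int slot = -1;
  std::string name;
  assert(parse_slot_name("0:encoder_output", slot, name) &&
         slot == 0 && name == "encoder_output");
  assert(!parse_slot_name("encoder_output", slot, name));  // no colon
  assert(!parse_slot_name(":x", slot, name));              // empty slot field
  assert(!parse_slot_name("2:", slot, name));              // empty name
  assert(!parse_slot_name("1a:x", slot, name));            // non-digit in slot
  assert(!parse_slot_name("9999999999:x", slot, name));    // overflows int
  return 0;
}
```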
@@ -79,6 +102,29 @@ class ET_EXPERIMENTAL CudaBackend final
   mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none)
   mutable std::string use_cache_input_name_; // Name of cached tensor to use
 
+  // Per-instance map of named GPU tensor references.
+  // Mutable because execute() is const but needs to modify this.
+  //
+  // LIFETIME CONTRACT:
+  // - Stored tensors are valid until overwritten or destroy() is called.
+  // - Caller must ensure the producing execute() call (e.g., encoder) completes
+  //   before any consuming execute() call (e.g., decoder) begins.
+  // - Caller must not call destroy() while execute() is in progress.
+  // - Overwriting a tensor (same cache name) deletes the old tensor immediately,
+  //   so caller must ensure no concurrent execute() is using it.
+  mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;
+
+  // Helper to clear stored GPU tensors and free their memory.
+  // Only call when no execute() is in progress.
+  void clear_gpu_tensors() const {
+    for (auto& pair : gpu_tensors_) {
+      if (pair.second.handle != nullptr) {
+        aoti_torch_delete_tensor_object(pair.second.handle);
+      }
+    }
+    gpu_tensors_.clear();
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
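
To make the lifetime contract concrete, here is a single-threaded toy model of the legal call sequence. Every name below (`FakeGpuTensor`, `Backend`, `execute_encoder`, `execute_decoder`) is a stand-in, not the real backend API:

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct FakeGpuTensor { std::vector<float> data; };

class Backend {
  std::unordered_map<std::string, FakeGpuTensor*> tensors_;
 public:
  // Producer step: stores a named tensor; overwriting frees the old one.
  void execute_encoder(const std::string& name) {
    auto it = tensors_.find(name);
    if (it != tensors_.end()) { delete it->second; tensors_.erase(it); }
    tensors_[name] = new FakeGpuTensor{std::vector<float>(8, 1.0f)};
  }
  // Consumer step: must run only after the producer step has completed.
  void execute_decoder(const std::string& name) {
    auto it = tensors_.find(name);
    std::printf("decoder sees '%s': %s\n", name.c_str(),
                it != tensors_.end() ? "present" : "missing");
  }
  // Frees everything; must not overlap with any execute_*() call.
  void destroy() {
    for (auto& p : tensors_) delete p.second;
    tensors_.clear();
  }
};

int main() {
  Backend b;
  b.execute_encoder("encoder_output"); // produce
  b.execute_decoder("encoder_output"); // consume (strictly after producer)
  b.execute_encoder("encoder_output"); // overwrite: old tensor freed here
  b.destroy();                         // only when no execute() is running
  return 0;
}
```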
@@ -133,46 +179,38 @@ class ET_EXPERIMENTAL CudaBackend final
                 std::array<char, executorch::runtime::kMaxOptionValueLength>>(
                 &option.value)) {
           std::string val(arr->data());
-          auto colon_pos = val.find(':');
-          if (colon_pos != std::string::npos) {
-            try {
-              cache_output_slot_ = std::stoi(val.substr(0, colon_pos));
-              cache_output_name_ = val.substr(colon_pos + 1);
-            } catch (const std::exception& e) {
-              ET_LOG(
-                  Error,
-                  "Invalid cache_output format '%s': %s",
-                  val.c_str(),
-                  e.what());
-              return Error::InvalidArgument;
-            }
+          int slot;
+          std::string name;
+          if (parse_slot_name(val, slot, name)) {
+            cache_output_slot_ = slot;
+            cache_output_name_ = std::move(name);
+          } else {
+            ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str());
+            return Error::InvalidArgument;
           }
         }
       }
-      // Handle use_cache_input: "slot:name" format (e.g., "1:encoder_output")
+      // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output")
       else if (strcmp(option.key, "use_cache_input") == 0) {
         if (auto* arr = std::get_if<
                 std::array<char, executorch::runtime::kMaxOptionValueLength>>(
                 &option.value)) {
           std::string val(arr->data());
-          auto colon_pos = val.find(':');
-          if (colon_pos != std::string::npos) {
-            try {
-              use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos));
-              use_cache_input_name_ = val.substr(colon_pos + 1);
-            } catch (const std::exception& e) {
-              ET_LOG(
-                  Error,
-                  "Invalid use_cache_input format '%s': %s",
-                  val.c_str(),
-                  e.what());
-              return Error::InvalidArgument;
-            }
+          int slot;
+          std::string name;
+          if (parse_slot_name(val, slot, name)) {
+            use_cache_input_slot_ = slot;
+            use_cache_input_name_ = std::move(name);
+          } else {
+            ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str());
+            return Error::InvalidArgument;
           }
         }
       }
-      // Handle clear_cache_input: reset input cache settings
-      else if (strcmp(option.key, "clear_cache_input") == 0) {
+      // Handle reset_cache_input: disable cache input for subsequent execute() calls.
+      // Note: This only resets the slot/name settings. The stored GPU tensor
+      // remains in memory until overwritten or destroy() is called.
+      else if (strcmp(option.key, "reset_cache_input") == 0) {
         if (auto* val = std::get_if<bool>(&option.value)) {
           if (*val) {
             use_cache_input_slot_ = -1;
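
A sketch of how a runner might pack these option values, assuming an option shape like the one this loop reads (a key plus either a `bool` or a fixed-size `char` array). The `Option` struct and the `kMaxOptionValueLength` constant below are local stand-ins; the real ExecuTorch option-setting API may differ:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <variant>

constexpr std::size_t kMaxOptionValueLength = 256; // stand-in for the runtime constant

struct Option {
  const char* key;
  std::variant<bool, std::array<char, kMaxOptionValueLength>> value;
};

// Packs a C string into the fixed-size array form the option loop expects.
static Option make_string_option(const char* key, const char* val) {
  std::array<char, kMaxOptionValueLength> buf{};
  std::snprintf(buf.data(), buf.size(), "%s", val);
  return Option{key, buf};
}

int main() {
  // Encoder run: stash output slot 0 under the name "encoder_output".
  Option cache_out = make_string_option("cache_output", "0:encoder_output");
  // Decoder runs: feed the stored tensor into input slot 2.
  Option use_in = make_string_option("use_cache_input", "2:encoder_output");
  // After the decode loop: stop substituting the cached input.
  Option reset_in{"reset_cache_input", true};

  for (const Option* opt : {&cache_out, &use_in, &reset_in}) {
    std::printf("%s\n", opt->key);
  }
  return 0;
}
```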
@@ -308,6 +346,28 @@ class ET_EXPERIMENTAL CudaBackend final
         n_outputs,
         args.size())
 
+    // Validate cache slot indices if set
+    if (use_cache_input_slot_ >= 0 &&
+        use_cache_input_slot_ >= static_cast<int>(n_inputs)) {
+      ET_LOG(
+          Warning,
+          "use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring",
+          use_cache_input_slot_,
+          n_inputs);
+      use_cache_input_slot_ = -1;
+      use_cache_input_name_.clear();
+    }
+    if (cache_output_slot_ >= 0 &&
+        cache_output_slot_ >= static_cast<int>(n_outputs)) {
+      ET_LOG(
+          Warning,
+          "cache_output slot %d is out of bounds (n_outputs=%zu), ignoring",
+          cache_output_slot_,
+          n_outputs);
+      cache_output_slot_ = -1;
+      cache_output_name_.clear();
+    }
+
     // NOTE: ExecuTorch tensors are always on CPU/host memory
     // We need to create GPU copies for CUDA kernel execution
     std::vector<AOTITensorHandle> gpu_inputs(
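
A worked example of the bounds rule being enforced: valid slots run from 0 to n-1, so a slot equal to n is rejected, and -1 is the disabled sentinel. A standalone approximation of the check above:

```cpp
#include <cstdio>

static bool slot_in_bounds(int slot, size_t n) {
  return slot >= 0 && slot < static_cast<int>(n);
}

int main() {
  size_t n_inputs = 3; // model takes input slots 0, 1, 2
  std::printf("slot 2: %s\n", slot_in_bounds(2, n_inputs) ? "ok" : "ignored");
  std::printf("slot 3: %s\n", slot_in_bounds(3, n_inputs) ? "ok" : "ignored");   // == n_inputs
  std::printf("slot -1: %s\n", slot_in_bounds(-1, n_inputs) ? "ok" : "ignored"); // disabled sentinel
  return 0;
}
```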
@@ -346,8 +406,8 @@ class ET_EXPERIMENTAL CudaBackend final
 
       // Check if this input slot should use a stored GPU tensor
       if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) {
-        auto it = g_gpu_tensors.find(use_cache_input_name_);
-        if (it != g_gpu_tensors.end()) {
+        auto it = gpu_tensors_.find(use_cache_input_name_);
+        if (it != gpu_tensors_.end()) {
           const GpuTensorRef& ref = it->second;
           // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
           size_t numel = gpu_inputs[i]->numel();
@@ -358,8 +418,8 @@ class ET_EXPERIMENTAL CudaBackend final
             copy_bytes == ref.size_bytes,
             Internal,
             "Stored tensor size mismatch: expected %zu bytes, got %zu",
-            copy_bytes,
-            ref.size_bytes);
+            ref.size_bytes,
+            copy_bytes);
 
           cudaError_t cuda_err = cudaMemcpy(
               gpu_inputs[i]->data_ptr(),
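
The copy path above follows a standard pattern: verify the byte counts match, then issue a device-to-device `cudaMemcpy`. A minimal standalone CUDA sketch of the same pattern (the helper name and `main` driver are illustrative):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Validates sizes, then performs a device-to-device copy.
static bool d2d_copy(void* dst, size_t dst_bytes, const void* src, size_t src_bytes) {
  if (dst_bytes != src_bytes) {
    std::fprintf(stderr, "size mismatch: expected %zu bytes, got %zu\n",
                 src_bytes, dst_bytes);
    return false;
  }
  cudaError_t err = cudaMemcpy(dst, src, dst_bytes, cudaMemcpyDeviceToDevice);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    return false;
  }
  return true;
}

int main() {
  float *src = nullptr, *dst = nullptr;
  const size_t bytes = 8 * sizeof(float);
  cudaMalloc(&src, bytes);
  cudaMalloc(&dst, bytes);
  std::printf("copy ok: %d\n", d2d_copy(dst, bytes, src, bytes));
  cudaFree(src);
  cudaFree(dst);
  return 0;
}
```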
@@ -434,25 +494,29 @@ class ET_EXPERIMENTAL CudaBackend final
 
     // Store reference to output GPU tensor if requested.
     // The tensor will be kept alive for later D2D copy to decoder inputs.
-    if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast<int>(n_outputs) &&
-        !cache_output_name_.empty()) {
+    // (Bounds already validated at start of execute())
+    if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) {
       auto* gpu_tensor = gpu_outputs[cache_output_slot_];
       size_t numel = gpu_tensor->numel();
       size_t elem_size = gpu_tensor->element_size();
       size_t size_bytes = numel * elem_size;
 
-      // Delete old tensor if overwriting
-      auto old_it = g_gpu_tensors.find(cache_output_name_);
-      if (old_it != g_gpu_tensors.end() && old_it->second.handle != nullptr) {
-        aoti_torch_delete_tensor_object(old_it->second.handle);
+      // Delete old tensor if overwriting (erase first to prevent double-free)
+      auto old_it = gpu_tensors_.find(cache_output_name_);
+      if (old_it != gpu_tensors_.end()) {
+        AOTITensorHandle old_handle = old_it->second.handle;
+        gpu_tensors_.erase(old_it); // Remove from map before deleting
+        if (old_handle != nullptr) {
+          aoti_torch_delete_tensor_object(old_handle);
+        }
       }
 
       // Store tensor reference (we now own this tensor)
       GpuTensorRef ref;
       ref.handle = gpu_tensor;
       ref.data_ptr = gpu_tensor->data_ptr();
       ref.size_bytes = size_bytes;
-      g_gpu_tensors[cache_output_name_] = ref;
+      gpu_tensors_[cache_output_name_] = ref;
 
       // Reset cache_output settings after caching
       cache_output_slot_ = -1;
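
The erase-before-delete ordering above generalizes: unlink the map entry first, then free, so the map never holds a pointer to freed memory and the old resource is freed exactly once. A generic sketch (the `Resource` type and helper names are illustrative):

```cpp
#include <string>
#include <unordered_map>

struct Resource { /* e.g., a GPU tensor handle */ };
static void free_resource(Resource* r) { delete r; }

static void store_replacing(
    std::unordered_map<std::string, Resource*>& map,
    const std::string& key,
    Resource* fresh) {
  auto it = map.find(key);
  if (it != map.end()) {
    Resource* old = it->second;
    map.erase(it); // unlink first: map never points at freed memory
    if (old != nullptr) {
      free_resource(old);
    }
  }
  map[key] = fresh; // map now owns `fresh`
}

int main() {
  std::unordered_map<std::string, Resource*> tensors;
  store_replacing(tensors, "encoder_output", new Resource{});
  store_replacing(tensors, "encoder_output", new Resource{}); // old freed exactly once
  for (auto& p : tensors) free_resource(p.second);
  return 0;
}
```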
@@ -473,21 +537,25 @@ class ET_EXPERIMENTAL CudaBackend final
           i);
     }
 
-    // Note: use_cache_input settings are intentionally NOT reset here.
-    // They persist across execute() calls to support decoder loops that
-    // reuse cached encoder output. The caller should explicitly clear
-    // these settings using the "clear_cache_input" option when done.
+    // Memory management notes:
+    // - use_cache_input settings persist across execute() calls to support
+    //   decoder loops that reuse the stored encoder output.
+    // - Stored GPU tensors (in gpu_tensors_) remain in memory until:
+    //   (a) overwritten by a new tensor with the same name, or
+    //   (b) destroy() is called, which frees all stored tensors.
+    // - The "reset_cache_input" option only resets the input slot/name settings,
+    //   NOT the stored GPU tensors themselves.
 
     // Cleanup: delete GPU tensors to avoid memory leak across execute() calls.
     // Input tensors are no longer needed after AOTI execution.
     for (size_t i = 0; i < n_inputs; i++) {
       aoti_torch_delete_tensor_object(gpu_inputs[i]);
     }
     // Output tensors are no longer needed after copying to CPU,
-    // EXCEPT for tensors stored in g_gpu_tensors (for later D2D copy).
+    // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy).
     for (size_t i = 0; i < n_outputs; i++) {
       bool is_stored = false;
-      for (const auto& pair : g_gpu_tensors) {
+      for (const auto& pair : gpu_tensors_) {
         if (pair.second.handle == gpu_outputs[i]) {
           is_stored = true;
           break;
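
The cleanup rule this loop implements, restated as a standalone sketch: every output handle is freed unless it is owned by the stored-tensor map (modeled here with an `unordered_set`; all names are illustrative):

```cpp
#include <unordered_set>
#include <vector>

struct Handle {};
static void delete_handle(Handle* h) { delete h; }

static void cleanup_outputs(
    const std::vector<Handle*>& outputs,
    const std::unordered_set<Handle*>& stored) {
  for (Handle* h : outputs) {
    if (stored.count(h) == 0) { // stored handles stay alive for later D2D reuse
      delete_handle(h);
    }
  }
}

int main() {
  Handle* kept = new Handle{};
  Handle* freed = new Handle{};
  std::unordered_set<Handle*> stored{kept};
  cleanup_outputs({kept, freed}, stored); // frees `freed`, keeps `kept`
  delete_handle(kept); // in the real backend, destroy() frees stored tensors
  return 0;
}
```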