@@ -57,50 +57,12 @@ struct GpuTensorRef {
5757 size_t size_bytes; // Total size in bytes
5858};
5959
60- // Parses "slot:name" format string. Returns true on success.
61- // Uses character-by-character parsing to avoid std::stoi exceptions.
62- static bool parse_slot_name (
63- const std::string& val,
64- int & out_slot,
65- std::string& out_name) {
66- auto colon_pos = val.find (' :' );
67- if (colon_pos == std::string::npos || colon_pos == 0 ) {
68- return false ;
69- }
70-
71- // Parse slot number manually to avoid exceptions
72- int slot = 0 ;
73- for (size_t i = 0 ; i < colon_pos; i++) {
74- char c = val[i];
75- if (c < ' 0' || c > ' 9' ) {
76- return false ; // Non-digit character
77- }
78- int digit = c - ' 0' ;
79- // Check for overflow
80- if (slot > (INT_MAX - digit) / 10 ) {
81- return false ;
82- }
83- slot = slot * 10 + digit;
84- }
85-
86- std::string name = val.substr (colon_pos + 1 );
87- if (name.empty ()) {
88- return false ;
89- }
90-
91- out_slot = slot;
92- out_name = std::move (name);
93- return true ;
94- }
95-
9660class ET_EXPERIMENTAL CudaBackend final
9761 : public ::executorch::runtime::BackendInterface {
9862 private:
99- // Cache control options (set via set_option before execute)
100- mutable int cache_output_slot_ = -1 ; // Which output slot to cache (-1 = none)
101- mutable std::string cache_output_name_; // Name to cache output under
102- mutable int use_cache_input_slot_ = -1 ; // Which input slot to use cache for (-1 = none)
103- mutable std::string use_cache_input_name_; // Name of cached tensor to use
63+ // Storage control options (set via set_option before execute)
64+ mutable std::string store_output_name_; // Name to store output under (empty = none)
65+ mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none)
10466
10567 // Per-instance map of named GPU tensor references.
10668 // Mutable because execute() is const but needs to modify this.
@@ -110,7 +72,7 @@ class ET_EXPERIMENTAL CudaBackend final
11072 // - Caller must ensure the producing execute() call (e.g., encoder) completes
11173 // before any consuming execute() call (e.g., decoder) begins.
11274 // - Caller must not call destroy() while execute() is in progress.
113- // - Overwriting a tensor (same cache name) deletes the old tensor immediately,
75+ // - Overwriting a tensor (same name) deletes the old tensor immediately,
11476 // so caller must ensure no concurrent execute() is using it.
11577 mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;
11678
@@ -173,49 +135,39 @@ class ET_EXPERIMENTAL CudaBackend final
173135 backend_options) override {
174136 for (size_t i = 0 ; i < backend_options.size (); i++) {
175137 const auto & option = backend_options[i];
176-	    // Handle cache_output: "slot:name" format (e.g., "0:encoder_output")
177- if (strcmp (option.key , " cache_output " ) == 0 ) {
138+ // Handle store_output: expects a string name (e.g., "encoder_output")
139+ if (strcmp (option.key , " store_output " ) == 0 ) {
178140 if (auto * arr = std::get_if<
179141 std::array<char , executorch::runtime::kMaxOptionValueLength >>(
180142 &option.value )) {
181- std::string val (arr->data ());
182- int slot;
183- std::string name;
184- if (parse_slot_name (val, slot, name)) {
185- cache_output_slot_ = slot;
186- cache_output_name_ = std::move (name);
187- } else {
188- ET_LOG (Error, " Invalid cache_output format: '%s'" , val.c_str ());
189- return Error::InvalidArgument;
190- }
143+ store_output_name_ = std::string (arr->data ());
144+ } else {
145+ ET_LOG (Warning, " store_output option expects a string value" );
146+ return Error::InvalidArgument;
191147 }
192148 }
193-	    // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output")
194- else if (strcmp (option.key , " use_cache_input " ) == 0 ) {
149+ // Handle use_stored_input: expects a string name (e.g., "encoder_output")
150+ else if (strcmp (option.key , " use_stored_input " ) == 0 ) {
195151 if (auto * arr = std::get_if<
196152 std::array<char , executorch::runtime::kMaxOptionValueLength >>(
197153 &option.value )) {
198- std::string val (arr->data ());
199- int slot;
200- std::string name;
201- if (parse_slot_name (val, slot, name)) {
202- use_cache_input_slot_ = slot;
203- use_cache_input_name_ = std::move (name);
204- } else {
205- ET_LOG (Error, " Invalid use_cache_input format: '%s'" , val.c_str ());
206- return Error::InvalidArgument;
207- }
154+ use_stored_input_name_ = std::string (arr->data ());
155+ } else {
156+ ET_LOG (Warning, " use_stored_input option expects a string value" );
157+ return Error::InvalidArgument;
208158 }
209159 }
210- // Handle reset_cache_input: disable cache input for subsequent execute() calls.
211-	    // Note: This only resets the slot/name settings. The stored GPU tensor
160+ // Handle reset_stored_input: expects a boolean value
161+	    // Note: This only resets the name setting. The stored GPU tensor
212162 // remains in memory until overwritten or destroy() is called.
213- else if (strcmp (option.key , " reset_cache_input " ) == 0 ) {
163+ else if (strcmp (option.key , " reset_stored_input " ) == 0 ) {
214164 if (auto * val = std::get_if<bool >(&option.value )) {
215165 if (*val) {
216- use_cache_input_slot_ = -1 ;
217- use_cache_input_name_.clear ();
166+ use_stored_input_name_.clear ();
218167 }
168+ } else {
169+ ET_LOG (Warning, " reset_stored_input option expects a boolean value" );
170+ return Error::InvalidArgument;
219171 }
220172 }
221173 }
@@ -346,28 +298,6 @@ class ET_EXPERIMENTAL CudaBackend final
346298 n_outputs,
347299 args.size ())
348300
349- // Validate cache slot indices if set
350- if (use_cache_input_slot_ >= 0 &&
351- use_cache_input_slot_ >= static_cast <int >(n_inputs)) {
352- ET_LOG (
353- Warning,
354- " use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring" ,
355- use_cache_input_slot_,
356- n_inputs);
357- use_cache_input_slot_ = -1 ;
358- use_cache_input_name_.clear ();
359- }
360- if (cache_output_slot_ >= 0 &&
361- cache_output_slot_ >= static_cast <int >(n_outputs)) {
362- ET_LOG (
363- Warning,
364- " cache_output slot %d is out of bounds (n_outputs=%zu), ignoring" ,
365- cache_output_slot_,
366- n_outputs);
367- cache_output_slot_ = -1 ;
368- cache_output_name_.clear ();
369- }
370-
371301 // NOTE: ExecuTorch tensors are always on CPU/host memory
372302 // We need to create GPU copies for CUDA kernel execution
373303 std::vector<AOTITensorHandle> gpu_inputs (
@@ -404,40 +334,35 @@ class ET_EXPERIMENTAL CudaBackend final
404334
405335 gpu_inputs[i] = gpu_input_handle;
406336
407- // Check if this input slot should use a stored GPU tensor
408- if (i == use_cache_input_slot_ && !use_cache_input_name_ .empty ()) {
409- auto it = gpu_tensors_.find (use_cache_input_name_ );
337+ // Check if this input matches a stored GPU tensor (by size)
338+ if (!use_stored_input_name_ .empty ()) {
339+ auto it = gpu_tensors_.find (use_stored_input_name_ );
410340 if (it != gpu_tensors_.end ()) {
411341 const GpuTensorRef& ref = it->second ;
412- // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
413342 size_t numel = gpu_inputs[i]->numel ();
414343 size_t elem_size = gpu_inputs[i]->element_size ();
415344 size_t copy_bytes = numel * elem_size;
416345
417- ET_CHECK_OR_RETURN_ERROR (
418- copy_bytes == ref.size_bytes ,
419- Internal,
420- " Stored tensor size mismatch: expected %zu bytes, got %zu" ,
421- ref.size_bytes ,
422- copy_bytes);
423-
424- cudaError_t cuda_err = cudaMemcpy (
425- gpu_inputs[i]->data_ptr (),
426- ref.data_ptr ,
427- copy_bytes,
428- cudaMemcpyDeviceToDevice);
429-
430- ET_CHECK_OR_RETURN_ERROR (
431- cuda_err == cudaSuccess,
432- Internal,
433- " Failed GPU-to-GPU copy for input %d: %s" ,
434- i,
435- cudaGetErrorString (cuda_err));
436-
437- // Skip the CPU-to-GPU copy below
438- continue ;
346+ // Match by size: use stored tensor if sizes match
347+ if (copy_bytes == ref.size_bytes ) {
348+ // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
349+ cudaError_t cuda_err = cudaMemcpy (
350+ gpu_inputs[i]->data_ptr (),
351+ ref.data_ptr ,
352+ copy_bytes,
353+ cudaMemcpyDeviceToDevice);
354+
355+ ET_CHECK_OR_RETURN_ERROR (
356+ cuda_err == cudaSuccess,
357+ Internal,
358+ " Failed GPU-to-GPU copy for input %d: %s" ,
359+ i,
360+ cudaGetErrorString (cuda_err));
361+
362+ // Skip the CPU-to-GPU copy below
363+ continue ;
364+ }
439365 }
440- // Not found: fall through to normal CPU-to-GPU copy
441366 }
442367
443368 // Copy data from CPU to GPU (normal path)
@@ -493,16 +418,16 @@ class ET_EXPERIMENTAL CudaBackend final
493418 error);
494419
495420 // Store reference to output GPU tensor if requested.
421+ // Always uses gpu_outputs[0] (encoder has single output).
496422 // The tensor will be kept alive for later D2D copy to decoder inputs.
497- // (Bounds already validated at start of execute())
498- if (cache_output_slot_ >= 0 && !cache_output_name_.empty ()) {
499- auto * gpu_tensor = gpu_outputs[cache_output_slot_];
423+ if (!store_output_name_.empty () && n_outputs > 0 ) {
424+ auto * gpu_tensor = gpu_outputs[0 ];
500425 size_t numel = gpu_tensor->numel ();
501426 size_t elem_size = gpu_tensor->element_size ();
502427 size_t size_bytes = numel * elem_size;
503428
504429 // Delete old tensor if overwriting (erase first to prevent double-free)
505- auto old_it = gpu_tensors_.find (cache_output_name_ );
430+ auto old_it = gpu_tensors_.find (store_output_name_ );
506431 if (old_it != gpu_tensors_.end ()) {
507432 AOTITensorHandle old_handle = old_it->second .handle ;
508433 gpu_tensors_.erase (old_it); // Remove from map before deleting
@@ -516,11 +441,10 @@ class ET_EXPERIMENTAL CudaBackend final
516441 ref.handle = gpu_tensor;
517442 ref.data_ptr = gpu_tensor->data_ptr ();
518443 ref.size_bytes = size_bytes;
519- gpu_tensors_[cache_output_name_ ] = ref;
444+ gpu_tensors_[store_output_name_ ] = ref;
520445
521- // Reset cache_output settings after caching
522- cache_output_slot_ = -1 ;
523- cache_output_name_.clear ();
446+ // Reset store_output name after storing
447+ store_output_name_.clear ();
524448 }
525449
526450 // Copy GPU output results back to CPU output tensors
@@ -538,12 +462,12 @@ class ET_EXPERIMENTAL CudaBackend final
538462 }
539463
540464 // Memory management notes:
541- // - use_cache_input settings persist across execute() calls to support
465+ // - use_stored_input setting persists across execute() calls to support
542466 // decoder loops that reuse the stored encoder output.
543467 // - Stored GPU tensors (in gpu_tensors_) remain in memory until:
544468 // (a) overwritten by a new tensor with the same name, or
545469 // (b) destroy() is called, which frees all stored tensors.
546-	  // - The "reset_cache_input" option only resets the input slot/name settings,
470+	  // - The "reset_stored_input" option only resets the input name setting,
547471 // NOT the stored GPU tensors themselves.
548472
549473 // Cleanup: delete GPU tensors to avoid memory leak across execute() calls.
0 commit comments