Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 209 additions & 3 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <cuda_runtime.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/backend/options.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
Expand All @@ -16,6 +17,7 @@
#include <filesystem>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

// Include our shim layer headers
Expand Down Expand Up @@ -46,9 +48,46 @@ using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::etensor::Tensor;

// Structure to hold a reference to a GPU tensor for "keep on device"
// optimization. Owns the tensor handle - must be deleted when no longer needed
// (see clear_gpu_tensors() / destroy() in the owning backend).
//
// Members are default-initialized so a default-constructed GpuTensorRef is
// inert: a nullptr handle is safely skipped by the cleanup paths, instead of
// an indeterminate pointer being passed to aoti_torch_delete_tensor_object.
struct GpuTensorRef {
  AOTITensorHandle handle{nullptr}; // Tensor handle (owned, for later deletion)
  void* data_ptr{nullptr}; // GPU memory pointer (for D2D copy)
  size_t size_bytes{0}; // Total size in bytes
};

class ET_EXPERIMENTAL CudaBackend final
: public ::executorch::runtime::BackendInterface {
private:
// Storage control options (set via set_option before execute)
mutable std::string
store_output_name_; // Name to store output under (empty = none)
mutable std::string
use_stored_input_name_; // Name of stored tensor to use (empty = none)

// Per-instance map of named GPU tensor references.
// Mutable because execute() is const but needs to modify this.
//
// LIFETIME CONTRACT:
// - Stored tensors are valid until overwritten or destroy() is called.
// - Caller must ensure the producing execute() call (e.g., encoder) completes
// before any consuming execute() call (e.g., decoder) begins.
// - Caller must not call destroy() while execute() is in progress.
// - Overwriting a tensor (same name) deletes the old tensor immediately,
// so caller must ensure no concurrent execute() is using it.
mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;

// Helper to clear stored GPU tensors and free their memory.
// Only call when no execute() is in progress.
void clear_gpu_tensors() const {
for (auto& pair : gpu_tensors_) {
if (pair.second.handle != nullptr) {
aoti_torch_delete_tensor_object(pair.second.handle);
}
}
gpu_tensors_.clear();
}

Error load_function_pointers_into_handle(
void* so_handle,
AOTIDelegateHandle* handle) const {
Expand Down Expand Up @@ -91,6 +130,51 @@ class ET_EXPERIMENTAL CudaBackend final
return 1;
}

// Applies storage-control options ahead of the next execute() call:
//   "store_output"       (string) - name under which to keep the next output
//   "use_stored_input"   (string) - name of a stored tensor to feed as input
//   "reset_stored_input" (bool)   - when true, clears the use_stored_input
//                                   name setting (the stored GPU tensor
//                                   itself stays in memory)
// Unrecognized keys are ignored; a recognized key with a wrongly-typed value
// logs a warning and returns Error::InvalidArgument.
Error set_option(
    __ET_UNUSED executorch::runtime::BackendOptionContext& context,
    const executorch::runtime::Span<executorch::runtime::BackendOption>&
        backend_options) override {
  using OptionString =
      std::array<char, executorch::runtime::kMaxOptionValueLength>;
  for (size_t idx = 0; idx < backend_options.size(); ++idx) {
    const auto& option = backend_options[idx];
    if (strcmp(option.key, "store_output") == 0) {
      const auto* str = std::get_if<OptionString>(&option.value);
      if (str == nullptr) {
        ET_LOG(Warning, "store_output option expects a string value");
        return Error::InvalidArgument;
      }
      store_output_name_ = std::string(str->data());
    } else if (strcmp(option.key, "use_stored_input") == 0) {
      const auto* str = std::get_if<OptionString>(&option.value);
      if (str == nullptr) {
        ET_LOG(Warning, "use_stored_input option expects a string value");
        return Error::InvalidArgument;
      }
      use_stored_input_name_ = std::string(str->data());
    } else if (strcmp(option.key, "reset_stored_input") == 0) {
      const auto* flag = std::get_if<bool>(&option.value);
      if (flag == nullptr) {
        ET_LOG(Warning, "reset_stored_input option expects a boolean value");
        return Error::InvalidArgument;
      }
      // Only resets the name setting; stored GPU tensors remain allocated
      // until overwritten or destroy() is called.
      if (*flag) {
        use_stored_input_name_.clear();
      }
    }
  }
  return Error::Ok;
}

// Once per loaded binary blob
Result<DelegateHandle*> init(
BackendInitContext& context,
Expand Down Expand Up @@ -222,15 +306,48 @@ class ET_EXPERIMENTAL CudaBackend final
std::vector<AOTITensorHandle> gpu_outputs(
n_outputs); // GPU tensors for kernel output

// RAII helper to ensure GPU tensors are cleaned up on all exit paths.
// Prevents memory leaks when errors occur during execute().
//
// Holds references to the input/output handle vectors and to the map of
// stored tensors; output handles that have been registered in that map are
// intentionally NOT freed here, because they must outlive this execute()
// call for later device-to-device reuse.
struct TensorCleanup {
  std::vector<AOTITensorHandle>& inputs;
  std::vector<AOTITensorHandle>& outputs;
  const std::unordered_map<std::string, GpuTensorRef>& stored_tensors;

  // Non-copyable / non-movable: a copy would double-delete every handle
  // when both guards go out of scope (Rule of Five — we define a dtor).
  TensorCleanup(const TensorCleanup&) = delete;
  TensorCleanup& operator=(const TensorCleanup&) = delete;
  TensorCleanup(TensorCleanup&&) = delete;
  TensorCleanup& operator=(TensorCleanup&&) = delete;

  ~TensorCleanup() {
    // Clean up input tensors
    for (auto* handle : inputs) {
      if (handle != nullptr) {
        aoti_torch_delete_tensor_object(handle);
      }
    }
    // Clean up output tensors, except those that are stored
    for (auto* handle : outputs) {
      if (handle != nullptr) {
        bool is_stored = false;
        // Linear scan is acceptable: the map holds at most a few named
        // tensors, and outputs are typically few as well.
        for (const auto& pair : stored_tensors) {
          if (pair.second.handle == handle) {
            is_stored = true;
            break;
          }
        }
        if (!is_stored) {
          aoti_torch_delete_tensor_object(handle);
        }
      }
    }
  }
};
TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};

// Process input tensors: ExecuTorch provides CPU tensors, create GPU
// copies
// copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
for (int i = 0; i < n_inputs; i++) {
// Get tensor dimensions and properties from ExecuTorch CPU tensor
auto cpu_tensor = &(args[i]->toTensor());
auto sizes = cpu_tensor->sizes();
auto scalar_type = cpu_tensor->scalar_type();

// Create GPU tensor with same shape
// Create GPU tensor with same shape (always needed for AOTI format)
std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());

AOTITensorHandle gpu_input_handle;
Expand All @@ -251,7 +368,48 @@ class ET_EXPERIMENTAL CudaBackend final

gpu_inputs[i] = gpu_input_handle;

// Copy data from CPU to GPU
// Check if this input matches a stored GPU tensor (by size).
// Note: Size-based matching assumes only one input will match. If
// multiple inputs have the same byte size as the stored tensor, the first
// match wins.
if (!use_stored_input_name_.empty()) {
auto it = gpu_tensors_.find(use_stored_input_name_);
if (it != gpu_tensors_.end()) {
const GpuTensorRef& ref = it->second;
size_t numel = gpu_inputs[i]->numel();
size_t elem_size = gpu_inputs[i]->element_size();
size_t copy_bytes = numel * elem_size;

// Match by size: use stored tensor if sizes match
if (copy_bytes == ref.size_bytes) {
ET_LOG(
Debug,
"Using stored tensor '%s' for input %d (%zu bytes, D2D copy)",
use_stored_input_name_.c_str(),
i,
copy_bytes);

// GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
cudaError_t cuda_err = cudaMemcpy(
gpu_inputs[i]->data_ptr(),
ref.data_ptr,
copy_bytes,
cudaMemcpyDeviceToDevice);

ET_CHECK_OR_RETURN_ERROR(
cuda_err == cudaSuccess,
Internal,
"Failed GPU-to-GPU copy for input %d: %s",
i,
cudaGetErrorString(cuda_err));

// Skip the CPU-to-GPU copy below
continue;
}
}
}

// Copy data from CPU to GPU (normal path)
ET_CHECK_OR_RETURN_ERROR(
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
Internal,
Expand Down Expand Up @@ -303,6 +461,41 @@ class ET_EXPERIMENTAL CudaBackend final
"AOTInductorModelContainerRun failed with error code %d",
error);

// Store reference to output GPU tensor if requested.
// The tensor will be kept alive for later D2D copy to decoder inputs.
if (!store_output_name_.empty()) {
ET_CHECK_OR_RETURN_ERROR(
n_outputs == 1,
InvalidArgument,
"store_output only supports single-output methods, got %zu outputs",
n_outputs);

auto* gpu_tensor = gpu_outputs[0];
size_t numel = gpu_tensor->numel();
size_t elem_size = gpu_tensor->element_size();
size_t size_bytes = numel * elem_size;

// Delete old tensor if overwriting (erase first to prevent double-free)
auto old_it = gpu_tensors_.find(store_output_name_);
if (old_it != gpu_tensors_.end()) {
AOTITensorHandle old_handle = old_it->second.handle;
gpu_tensors_.erase(old_it); // Remove from map before deleting
if (old_handle != nullptr) {
aoti_torch_delete_tensor_object(old_handle);
}
}

// Store tensor reference (we now own this tensor)
GpuTensorRef ref;
ref.handle = gpu_tensor;
ref.data_ptr = gpu_tensor->data_ptr();
ref.size_bytes = size_bytes;
gpu_tensors_[store_output_name_] = ref;

// Reset store_output name after storing
store_output_name_.clear();
}

// Copy GPU output results back to CPU output tensors
for (int i = 0; i < n_outputs; i++) {
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
Expand All @@ -317,6 +510,16 @@ class ET_EXPERIMENTAL CudaBackend final
i);
}

// Memory management notes:
// - GPU tensor cleanup is handled by TensorCleanup RAII guard above.
// - use_stored_input setting persists across execute() calls to support
// decoder loops that reuse the stored encoder output.
// - Stored GPU tensors (in gpu_tensors_) remain in memory until:
// (a) overwritten by a new tensor with the same name, or
// (b) destroy() is called, which frees all stored tensors.
// - The "reset_stored_input" option only resets the input name setting,
// NOT the stored GPU tensors themselves.

return Error::Ok;
}

Expand All @@ -326,6 +529,9 @@ class ET_EXPERIMENTAL CudaBackend final
}
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;

// Delete stored GPU tensors
clear_gpu_tensors();

// Destroy the CUDA stream if it exists
if (handle->cuda_stream != nullptr) {
cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
Expand Down
42 changes: 42 additions & 0 deletions extension/asr/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <executorch/extension/llm/runner/util.h>
#include <executorch/extension/llm/sampler/util.h>
#include <executorch/extension/tensor/tensor_ptr_maker.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/backend/options.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/log.h>
Expand Down Expand Up @@ -196,6 +198,18 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
}
}

// Tell CUDA backend to store encoder output as "encoder_output"
{
::executorch::runtime::BackendOptions<1> opts;
opts.set_option("store_output", "encoder_output");
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
if (err != ::executorch::runtime::Error::Ok) {
ET_LOG(
Warning,
"Failed to set store_output option (backend may not support storage)");
}
}

auto encoder_result =
module_->execute(kEncoderMethodName, preprocessed_features);
ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error());
Expand Down Expand Up @@ -249,6 +263,20 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
decoder_inputs.emplace_back(decoder_input_ptr);
decoder_inputs.emplace_back(encoder_output_ptr);
decoder_inputs.emplace_back(cache_position_ptr);

// Tell CUDA backend to use stored encoder output for matching decoder inputs.
// The backend matches by tensor size, avoiding redundant CPU->GPU copies.
{
::executorch::runtime::BackendOptions<1> opts;
opts.set_option("use_stored_input", "encoder_output");
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
if (err != ::executorch::runtime::Error::Ok) {
ET_LOG(
Warning,
"Failed to set use_stored_input option (backend may not support storage)");
}
}

// Add some green coloring for the first generated token
// token_callback("\033[1;32m");
while (generated_tokens < config.max_new_tokens) {
Expand Down Expand Up @@ -304,6 +332,20 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
break;
}
}

// Reset stored input settings after decoder loop completes.
// This disables the D2D copy optimization for subsequent execute() calls.
// Note: The stored GPU tensor remains in memory until the next encoder run
// (which overwrites it) or until the backend is destroyed.
{
::executorch::runtime::BackendOptions<1> opts;
opts.set_option("reset_stored_input", true);
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
if (err != ::executorch::runtime::Error::Ok) {
ET_LOG(Warning, "Failed to set reset_stored_input option");
}
}

// Reset coloring
// token_callback("\033[0m");
// Update stats and print report
Expand Down
Loading