@@ -59,6 +59,47 @@ struct GpuTensorRef {
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // ============================================================================
+  // GPU Tensor Storage for D2D Copy Optimization
+  // ============================================================================
+  //
+  // This backend supports storing GPU tensors between execute() calls to enable
+  // device-to-device (D2D) copies instead of slower host-to-device (H2D) copies.
+  // This is useful for encoder-decoder models where the encoder output is reused
+  // across many decoder iterations.
+  //
+  // SUPPORTED OPTIONS (via set_option):
+  //
+  // "store_output" (string): Store the output tensor under this name after
+  //     the next execute() call. The tensor remains on the GPU until cleared.
+  //     Only supports single-output methods.
+  //     Example: opts.set_option("store_output", "encoder_output");
+  //
+  // "use_stored_input" (string): For inputs matching the stored tensor's size,
+  //     use a D2D copy from the stored tensor instead of an H2D copy from CPU.
+  //     This setting persists across execute() calls until reset.
+  //     Example: opts.set_option("use_stored_input", "encoder_output");
+  //
+  // "reset_stored_input" (bool): Clear the use_stored_input setting.
+  //     Does NOT delete the stored tensor - only stops using it for D2D.
+  //     Example: opts.set_option("reset_stored_input", true);
+  //
+  // "clear_stored_tensor" (string): Delete the named tensor from storage,
+  //     freeing GPU memory. Use after the decoder loop completes.
+  //     Example: opts.set_option("clear_stored_tensor", "encoder_output");
+  //
+  // TYPICAL USAGE PATTERN (encoder-decoder model):
+  //
+  //   1. Before encoder: set_option("store_output", "encoder_output")
+  //   2. Execute encoder (output is stored on GPU)
+  //   3. Before decoder loop: set_option("use_stored_input", "encoder_output")
+  //   4. Execute decoder N times (D2D copies for the encoder-output input)
+  //   5. After decoder loop:
+  //        set_option("reset_stored_input", true)
+  //        set_option("clear_stored_tensor", "encoder_output")
+  //
+  // ============================================================================
+
   // Storage control options (set via set_option before execute)
   mutable std::string
       store_output_name_; // Name to store output under (empty = none)
@@ -171,6 +212,25 @@ class ET_EXPERIMENTAL CudaBackend final
         return Error::InvalidArgument;
       }
     }
+    // Handle clear_stored_tensor: expects a string name.
+    // Deletes the named GPU tensor from storage, freeing GPU memory.
+    else if (strcmp(option.key, "clear_stored_tensor") == 0) {
+      if (auto* arr = std::get_if<
+              std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+              &option.value)) {
+        std::string name(arr->data());
+        auto it = gpu_tensors_.find(name);
+        if (it != gpu_tensors_.end()) {
+          if (it->second.handle != nullptr) {
+            aoti_torch_delete_tensor_object(it->second.handle);
+          }
+          gpu_tensors_.erase(it);
+        }
+      } else {
+        ET_LOG(Warning, "clear_stored_tensor option expects a string value");
+        return Error::InvalidArgument;
+      }
+    }
   }
   return Error::Ok;
 }
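
For orientation, here is a caller-side sketch of the usage pattern documented in the new comment block. It is not part of this change: only the option keys and example values come from the diff, while the BackendOptions container, its capacity arguments, and the plumbing that delivers the options to the CUDA backend and runs the encoder/decoder are assumptions or elided.

// Hypothetical driver for an encoder-decoder model; illustrative only.
using executorch::runtime::BackendOptions;

// 1. Before the encoder: ask the backend to keep its output on the GPU.
BackendOptions<1> store_opts;
store_opts.set_option("store_output", "encoder_output");
// ... deliver store_opts to the backend, then run the encoder once (step 2) ...

// 3. Before the decoder loop: reuse the stored tensor via D2D copies.
BackendOptions<1> use_opts;
use_opts.set_option("use_stored_input", "encoder_output");
// ... deliver use_opts, then run the decoder N times (step 4) ...

// 5. After the decoder loop: stop D2D reuse and free the stored GPU tensor.
BackendOptions<2> cleanup_opts;
cleanup_opts.set_option("reset_stored_input", true);
cleanup_opts.set_option("clear_stored_tensor", "encoder_output");
// ... deliver cleanup_opts to the backend ...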