@@ -59,6 +59,47 @@ struct GpuTensorRef {
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // ============================================================================
+  // GPU Tensor Storage for D2D Copy Optimization
+  // ============================================================================
+  //
+  // This backend supports storing GPU tensors between execute() calls to enable
+  // device-to-device (D2D) copies instead of slower host-to-device (H2D) copies.
+  // This is useful for encoder-decoder models where the encoder output is reused
+  // across many decoder iterations.
+  //
+  // SUPPORTED OPTIONS (via set_option):
+  //
+  // "store_output" (string): Store the output tensor under this name after
+  //     the next execute() call. The tensor remains on the GPU until cleared.
+  //     Only supports single-output methods.
+  //     Example: opts.set_option("store_output", "encoder_output");
+  //
+  // "use_stored_input" (string): For inputs matching the stored tensor's size,
+  //     use a D2D copy from the stored tensor instead of an H2D copy from CPU.
+  //     This setting persists across execute() calls until reset.
+  //     Example: opts.set_option("use_stored_input", "encoder_output");
+  //
+  // "reset_stored_input" (bool): Clear the use_stored_input setting.
+  //     Does NOT delete the stored tensor - only stops using it for D2D.
+  //     Example: opts.set_option("reset_stored_input", true);
+  //
+  // "clear_stored_tensor" (string): Delete the named tensor from storage,
+  //     freeing GPU memory. Use after the decoder loop completes.
+  //     Example: opts.set_option("clear_stored_tensor", "encoder_output");
+  //
+  // TYPICAL USAGE PATTERN (encoder-decoder model):
+  //
+  //   1. Before encoder: set_option("store_output", "encoder_output")
+  //   2. Execute encoder (output is stored on GPU)
+  //   3. Before decoder loop: set_option("use_stored_input", "encoder_output")
+  //   4. Execute decoder N times (D2D copies for the encoder-output input)
+  //   5. After decoder loop:
+  //        set_option("reset_stored_input", true)
+  //        set_option("clear_stored_tensor", "encoder_output")
+  //
+  // ============================================================================
+
   // Storage control options (set via set_option before execute)
   mutable std::string
       store_output_name_; // Name to store output under (empty = none)
@@ -171,6 +212,25 @@ class ET_EXPERIMENTAL CudaBackend final
         return Error::InvalidArgument;
       }
     }
+    // Handle clear_stored_tensor: expects a string name.
+    // Deletes the named GPU tensor from storage, freeing GPU memory.
+    else if (strcmp(option.key, "clear_stored_tensor") == 0) {
+      if (auto* arr = std::get_if<
+              std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+              &option.value)) {
+        std::string name(arr->data());
+        auto it = gpu_tensors_.find(name);
+        if (it != gpu_tensors_.end()) {
+          if (it->second.handle != nullptr) {
+            aoti_torch_delete_tensor_object(it->second.handle);
+          }
+          gpu_tensors_.erase(it);
+        }
+      } else {
+        ET_LOG(Warning, "clear_stored_tensor option expects a string value");
+        return Error::InvalidArgument;
+      }
+    }
   }
   return Error::Ok;
 }
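
For orientation, here is a caller-side sketch of the usage pattern documented in the new comment block. It is not part of this change: only the option keys and example values come from the diff, while the BackendOptions container, its capacity arguments, and the plumbing that delivers the options to the CUDA backend and runs the encoder/decoder are assumptions or elided.

// Hypothetical driver for an encoder-decoder model; illustrative only.
using executorch::runtime::BackendOptions;

// 1. Before the encoder: ask the backend to keep its output on the GPU.
BackendOptions<1> store_opts;
store_opts.set_option("store_output", "encoder_output");
// ... deliver store_opts to the backend, then run the encoder once (step 2) ...

// 3. Before the decoder loop: reuse the stored tensor via D2D copies.
BackendOptions<1> use_opts;
use_opts.set_option("use_stored_input", "encoder_output");
// ... deliver use_opts, then run the decoder N times (step 4) ...

// 5. After the decoder loop: stop D2D reuse and free the stored GPU tensor.
BackendOptions<2> cleanup_opts;
cleanup_opts.set_option("reset_stored_input", true);
cleanup_opts.set_option("clear_stored_tensor", "encoder_output");
// ... deliver cleanup_opts to the backend ...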