Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 209 additions & 3 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <cuda_runtime.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/backend/options.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
Expand All @@ -16,6 +17,7 @@
#include <filesystem>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

// Include our shim layer headers
Expand Down Expand Up @@ -46,9 +48,46 @@ using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::etensor::Tensor;

// Structure to hold a reference to a GPU tensor for "keep on device"
// optimization. Owns the tensor handle - must be deleted when no longer needed
// (see clear_gpu_tensors() / destroy() in the owning backend).
//
// Members are default-initialized so a default-constructed GpuTensorRef is
// inert: a nullptr handle is safely skipped by the cleanup paths, instead of
// an indeterminate pointer being passed to aoti_torch_delete_tensor_object.
struct GpuTensorRef {
  AOTITensorHandle handle{nullptr}; // Tensor handle (owned, for later deletion)
  void* data_ptr{nullptr}; // GPU memory pointer (for D2D copy)
  size_t size_bytes{0}; // Total size in bytes
};

class ET_EXPERIMENTAL CudaBackend final
: public ::executorch::runtime::BackendInterface {
private:
// Storage control options (set via set_option before execute)
mutable std::string
store_output_name_; // Name to store output under (empty = none)
mutable std::string
use_stored_input_name_; // Name of stored tensor to use (empty = none)

// Per-instance map of named GPU tensor references.
// Mutable because execute() is const but needs to modify this.
//
// LIFETIME CONTRACT:
// - Stored tensors are valid until overwritten or destroy() is called.
// - Caller must ensure the producing execute() call (e.g., encoder) completes
// before any consuming execute() call (e.g., decoder) begins.
// - Caller must not call destroy() while execute() is in progress.
// - Overwriting a tensor (same name) deletes the old tensor immediately,
// so caller must ensure no concurrent execute() is using it.
mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;

// Helper to clear stored GPU tensors and free their memory.
// Only call when no execute() is in progress.
void clear_gpu_tensors() const {
for (auto& pair : gpu_tensors_) {
if (pair.second.handle != nullptr) {
aoti_torch_delete_tensor_object(pair.second.handle);
}
}
gpu_tensors_.clear();
}

Error load_function_pointers_into_handle(
void* so_handle,
AOTIDelegateHandle* handle) const {
Expand Down Expand Up @@ -91,6 +130,51 @@ class ET_EXPERIMENTAL CudaBackend final
return 1;
}

// Applies storage-control options ahead of the next execute() call:
//   "store_output"       (string) - name under which to keep the next output
//   "use_stored_input"   (string) - name of a stored tensor to feed as input
//   "reset_stored_input" (bool)   - when true, clears the use_stored_input
//                                   name setting (the stored GPU tensor
//                                   itself stays in memory)
// Unrecognized keys are ignored; a recognized key with a wrongly-typed value
// logs a warning and returns Error::InvalidArgument.
Error set_option(
    __ET_UNUSED executorch::runtime::BackendOptionContext& context,
    const executorch::runtime::Span<executorch::runtime::BackendOption>&
        backend_options) override {
  using OptionString =
      std::array<char, executorch::runtime::kMaxOptionValueLength>;
  for (size_t idx = 0; idx < backend_options.size(); ++idx) {
    const auto& option = backend_options[idx];
    if (strcmp(option.key, "store_output") == 0) {
      const auto* str = std::get_if<OptionString>(&option.value);
      if (str == nullptr) {
        ET_LOG(Warning, "store_output option expects a string value");
        return Error::InvalidArgument;
      }
      store_output_name_ = std::string(str->data());
    } else if (strcmp(option.key, "use_stored_input") == 0) {
      const auto* str = std::get_if<OptionString>(&option.value);
      if (str == nullptr) {
        ET_LOG(Warning, "use_stored_input option expects a string value");
        return Error::InvalidArgument;
      }
      use_stored_input_name_ = std::string(str->data());
    } else if (strcmp(option.key, "reset_stored_input") == 0) {
      const auto* flag = std::get_if<bool>(&option.value);
      if (flag == nullptr) {
        ET_LOG(Warning, "reset_stored_input option expects a boolean value");
        return Error::InvalidArgument;
      }
      // Only resets the name setting; stored GPU tensors remain allocated
      // until overwritten or destroy() is called.
      if (*flag) {
        use_stored_input_name_.clear();
      }
    }
  }
  return Error::Ok;
}

// Once per loaded binary blob
Result<DelegateHandle*> init(
BackendInitContext& context,
Expand Down Expand Up @@ -222,15 +306,48 @@ class ET_EXPERIMENTAL CudaBackend final
std::vector<AOTITensorHandle> gpu_outputs(
n_outputs); // GPU tensors for kernel output

// RAII helper to ensure GPU tensors are cleaned up on all exit paths.
// Prevents memory leaks when errors occur during execute().
//
// Holds references to the input/output handle vectors and to the map of
// stored tensors; output handles that have been registered in that map are
// intentionally NOT freed here, because they must outlive this execute()
// call for later device-to-device reuse.
struct TensorCleanup {
  std::vector<AOTITensorHandle>& inputs;
  std::vector<AOTITensorHandle>& outputs;
  const std::unordered_map<std::string, GpuTensorRef>& stored_tensors;

  // Non-copyable / non-movable: a copy would double-delete every handle
  // when both guards go out of scope (Rule of Five — we define a dtor).
  TensorCleanup(const TensorCleanup&) = delete;
  TensorCleanup& operator=(const TensorCleanup&) = delete;
  TensorCleanup(TensorCleanup&&) = delete;
  TensorCleanup& operator=(TensorCleanup&&) = delete;

  ~TensorCleanup() {
    // Clean up input tensors
    for (auto* handle : inputs) {
      if (handle != nullptr) {
        aoti_torch_delete_tensor_object(handle);
      }
    }
    // Clean up output tensors, except those that are stored
    for (auto* handle : outputs) {
      if (handle != nullptr) {
        bool is_stored = false;
        // Linear scan is acceptable: the map holds at most a few named
        // tensors, and outputs are typically few as well.
        for (const auto& pair : stored_tensors) {
          if (pair.second.handle == handle) {
            is_stored = true;
            break;
          }
        }
        if (!is_stored) {
          aoti_torch_delete_tensor_object(handle);
        }
      }
    }
  }
};
TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};

// Process input tensors: ExecuTorch provides CPU tensors, create GPU
// copies
// copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
for (int i = 0; i < n_inputs; i++) {
// Get tensor dimensions and properties from ExecuTorch CPU tensor
auto cpu_tensor = &(args[i]->toTensor());
auto sizes = cpu_tensor->sizes();
auto scalar_type = cpu_tensor->scalar_type();

// Create GPU tensor with same shape
// Create GPU tensor with same shape (always needed for AOTI format)
std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());

AOTITensorHandle gpu_input_handle;
Expand All @@ -251,7 +368,48 @@ class ET_EXPERIMENTAL CudaBackend final

gpu_inputs[i] = gpu_input_handle;

// Copy data from CPU to GPU
// Check if this input matches a stored GPU tensor (by size).
// Note: Size-based matching assumes only one input will match. If
// multiple inputs have the same byte size as the stored tensor, the first
// match wins.
if (!use_stored_input_name_.empty()) {
auto it = gpu_tensors_.find(use_stored_input_name_);
if (it != gpu_tensors_.end()) {
const GpuTensorRef& ref = it->second;
size_t numel = gpu_inputs[i]->numel();
size_t elem_size = gpu_inputs[i]->element_size();
size_t copy_bytes = numel * elem_size;

// Match by size: use stored tensor if sizes match
if (copy_bytes == ref.size_bytes) {
ET_LOG(
Debug,
"Using stored tensor '%s' for input %d (%zu bytes, D2D copy)",
use_stored_input_name_.c_str(),
i,
copy_bytes);

// GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
cudaError_t cuda_err = cudaMemcpy(
gpu_inputs[i]->data_ptr(),
ref.data_ptr,
copy_bytes,
cudaMemcpyDeviceToDevice);

ET_CHECK_OR_RETURN_ERROR(
cuda_err == cudaSuccess,
Internal,
"Failed GPU-to-GPU copy for input %d: %s",
i,
cudaGetErrorString(cuda_err));

// Skip the CPU-to-GPU copy below
continue;
}
}
}

// Copy data from CPU to GPU (normal path)
ET_CHECK_OR_RETURN_ERROR(
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
Internal,
Expand Down Expand Up @@ -303,6 +461,41 @@ class ET_EXPERIMENTAL CudaBackend final
"AOTInductorModelContainerRun failed with error code %d",
error);

// Store reference to output GPU tensor if requested.
// The tensor will be kept alive for later D2D copy to decoder inputs.
if (!store_output_name_.empty()) {
ET_CHECK_OR_RETURN_ERROR(
n_outputs == 1,
InvalidArgument,
"store_output only supports single-output methods, got %zu outputs",
n_outputs);

auto* gpu_tensor = gpu_outputs[0];
size_t numel = gpu_tensor->numel();
size_t elem_size = gpu_tensor->element_size();
size_t size_bytes = numel * elem_size;

// Delete old tensor if overwriting (erase first to prevent double-free)
auto old_it = gpu_tensors_.find(store_output_name_);
if (old_it != gpu_tensors_.end()) {
AOTITensorHandle old_handle = old_it->second.handle;
gpu_tensors_.erase(old_it); // Remove from map before deleting
if (old_handle != nullptr) {
aoti_torch_delete_tensor_object(old_handle);
}
}

// Store tensor reference (we now own this tensor)
GpuTensorRef ref;
ref.handle = gpu_tensor;
ref.data_ptr = gpu_tensor->data_ptr();
ref.size_bytes = size_bytes;
gpu_tensors_[store_output_name_] = ref;

// Reset store_output name after storing
store_output_name_.clear();
}

// Copy GPU output results back to CPU output tensors
for (int i = 0; i < n_outputs; i++) {
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
Expand All @@ -317,6 +510,16 @@ class ET_EXPERIMENTAL CudaBackend final
i);
}

// Memory management notes:
// - GPU tensor cleanup is handled by TensorCleanup RAII guard above.
// - use_stored_input setting persists across execute() calls to support
// decoder loops that reuse the stored encoder output.
// - Stored GPU tensors (in gpu_tensors_) remain in memory until:
// (a) overwritten by a new tensor with the same name, or
// (b) destroy() is called, which frees all stored tensors.
// - The "reset_stored_input" option only resets the input name setting,
// NOT the stored GPU tensors themselves.

return Error::Ok;
}

Expand All @@ -326,6 +529,9 @@ class ET_EXPERIMENTAL CudaBackend final
}
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;

// Delete stored GPU tensors
clear_gpu_tensors();

// Destroy the CUDA stream if it exists
if (handle->cuda_stream != nullptr) {
cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
Expand Down
42 changes: 42 additions & 0 deletions extension/asr/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <executorch/extension/llm/runner/util.h>
#include <executorch/extension/llm/sampler/util.h>
#include <executorch/extension/tensor/tensor_ptr_maker.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/backend/options.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/log.h>
Expand Down Expand Up @@ -196,6 +198,18 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
}
}

// Tell CUDA backend to store encoder output as "encoder_output"
{
::executorch::runtime::BackendOptions<1> opts;
opts.set_option("store_output", "encoder_output");
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
if (err != ::executorch::runtime::Error::Ok) {
ET_LOG(
Warning,
"Failed to set store_output option (backend may not support storage)");
}
}

auto encoder_result =
module_->execute(kEncoderMethodName, preprocessed_features);
ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error());
Expand Down Expand Up @@ -249,6 +263,20 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
decoder_inputs.emplace_back(decoder_input_ptr);
decoder_inputs.emplace_back(encoder_output_ptr);
decoder_inputs.emplace_back(cache_position_ptr);

// Tell CUDA backend to use stored encoder output for matching decoder inputs.
// The backend matches by tensor size, avoiding redundant CPU->GPU copies.
{
::executorch::runtime::BackendOptions<1> opts;
opts.set_option("use_stored_input", "encoder_output");
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
if (err != ::executorch::runtime::Error::Ok) {
ET_LOG(
Warning,
"Failed to set use_stored_input option (backend may not support storage)");
}
}

// Add some green coloring for the first generated token
// token_callback("\033[1;32m");
while (generated_tokens < config.max_new_tokens) {
Expand Down Expand Up @@ -304,6 +332,20 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
break;
}
}

// Reset stored input settings after decoder loop completes.
// This disables the D2D copy optimization for subsequent execute() calls.
// Note: The stored GPU tensor remains in memory until the next encoder run
// (which overwrites it) or until the backend is destroyed.
{
::executorch::runtime::BackendOptions<1> opts;
opts.set_option("reset_stored_input", true);
auto err = ::executorch::runtime::set_option("CudaBackend", opts.view());
if (err != ::executorch::runtime::Error::Ok) {
ET_LOG(Warning, "Failed to set reset_stored_input option");
}
}

// Reset coloring
// token_callback("\033[0m");
// Update stats and print report
Expand Down
Loading