
Commit 03d27e7

Address Copilot comments
1 parent ffbfbe7 commit 03d27e7

2 files changed: 128 additions & 58 deletions


backends/cuda/runtime/cuda_backend.cpp

Lines changed: 123 additions & 55 deletions
@@ -12,6 +12,7 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <climits>
 #include <cstdio>
 
 #include <filesystem>
@@ -56,18 +57,40 @@ struct GpuTensorRef {
   size_t size_bytes; // Total size in bytes
 };
 
-// Global map of named GPU tensor references.
-// Note: NOT thread-safe. Callers must ensure execute() is called from a single thread.
-static std::unordered_map<std::string, GpuTensorRef> g_gpu_tensors;
+// Parses "slot:name" format string. Returns true on success.
+// Uses character-by-character parsing to avoid std::stoi exceptions.
+static bool parse_slot_name(
+    const std::string& val,
+    int& out_slot,
+    std::string& out_name) {
+  auto colon_pos = val.find(':');
+  if (colon_pos == std::string::npos || colon_pos == 0) {
+    return false;
+  }
 
-// Helper to clear stored GPU tensors and free their memory
-static void clear_gpu_tensors() {
-  for (auto& pair : g_gpu_tensors) {
-    if (pair.second.handle != nullptr) {
-      aoti_torch_delete_tensor_object(pair.second.handle);
+  // Parse slot number manually to avoid exceptions
+  int slot = 0;
+  for (size_t i = 0; i < colon_pos; i++) {
+    char c = val[i];
+    if (c < '0' || c > '9') {
+      return false; // Non-digit character
+    }
+    int digit = c - '0';
+    // Check for overflow
+    if (slot > (INT_MAX - digit) / 10) {
+      return false;
     }
+    slot = slot * 10 + digit;
   }
-  g_gpu_tensors.clear();
+
+  std::string name = val.substr(colon_pos + 1);
+  if (name.empty()) {
+    return false;
+  }
+
+  out_slot = slot;
+  out_name = std::move(name);
+  return true;
 }
 
 class ET_EXPERIMENTAL CudaBackend final
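As a quick illustration of the parsing rules this new helper enforces (a colon must be present and not first, the slot part must be all digits with an overflow guard, and the name part must be non-empty), here is a minimal standalone sketch; parse_slot_name_demo and the sample inputs are hypothetical and simply mirror the logic in the hunk above.

#include <climits>
#include <iostream>
#include <string>
#include <utility>

// Mirrors the accept/reject behavior of parse_slot_name, for illustration only.
static bool parse_slot_name_demo(
    const std::string& val, int& out_slot, std::string& out_name) {
  auto colon_pos = val.find(':');
  if (colon_pos == std::string::npos || colon_pos == 0) {
    return false; // missing colon, or empty slot part
  }
  int slot = 0;
  for (size_t i = 0; i < colon_pos; i++) {
    char c = val[i];
    if (c < '0' || c > '9') {
      return false; // non-digit in the slot part
    }
    int digit = c - '0';
    if (slot > (INT_MAX - digit) / 10) {
      return false; // slot would overflow int
    }
    slot = slot * 10 + digit;
  }
  std::string name = val.substr(colon_pos + 1);
  if (name.empty()) {
    return false; // e.g. "2:" has no name
  }
  out_slot = slot;
  out_name = std::move(name);
  return true;
}

int main() {
  int slot = -1;
  std::string name;
  std::cout << parse_slot_name_demo("2:encoder_output", slot, name) << '\n'; // 1 (slot=2, name="encoder_output")
  std::cout << parse_slot_name_demo(":x", slot, name) << '\n';               // 0 (empty slot)
  std::cout << parse_slot_name_demo("a:x", slot, name) << '\n';              // 0 (non-digit)
  std::cout << parse_slot_name_demo("2:", slot, name) << '\n';               // 0 (empty name)
  std::cout << parse_slot_name_demo("99999999999:x", slot, name) << '\n';    // 0 (overflow)
  return 0;
}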
@@ -79,6 +102,29 @@ class ET_EXPERIMENTAL CudaBackend final
   mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none)
   mutable std::string use_cache_input_name_; // Name of cached tensor to use
 
+  // Per-instance map of named GPU tensor references.
+  // Mutable because execute() is const but needs to modify this.
+  //
+  // LIFETIME CONTRACT:
+  // - Stored tensors are valid until overwritten or destroy() is called.
+  // - Caller must ensure the producing execute() call (e.g., encoder) completes
+  //   before any consuming execute() call (e.g., decoder) begins.
+  // - Caller must not call destroy() while execute() is in progress.
+  // - Overwriting a tensor (same cache name) deletes the old tensor immediately,
+  //   so caller must ensure no concurrent execute() is using it.
+  mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;
+
+  // Helper to clear stored GPU tensors and free their memory.
+  // Only call when no execute() is in progress.
+  void clear_gpu_tensors() const {
+    for (auto& pair : gpu_tensors_) {
+      if (pair.second.handle != nullptr) {
+        aoti_torch_delete_tensor_object(pair.second.handle);
+      }
+    }
+    gpu_tensors_.clear();
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -133,46 +179,38 @@ class ET_EXPERIMENTAL CudaBackend final
                 std::array<char, executorch::runtime::kMaxOptionValueLength>>(
                 &option.value)) {
           std::string val(arr->data());
-          auto colon_pos = val.find(':');
-          if (colon_pos != std::string::npos) {
-            try {
-              cache_output_slot_ = std::stoi(val.substr(0, colon_pos));
-              cache_output_name_ = val.substr(colon_pos + 1);
-            } catch (const std::exception& e) {
-              ET_LOG(
-                  Error,
-                  "Invalid cache_output format '%s': %s",
-                  val.c_str(),
-                  e.what());
-              return Error::InvalidArgument;
-            }
+          int slot;
+          std::string name;
+          if (parse_slot_name(val, slot, name)) {
+            cache_output_slot_ = slot;
+            cache_output_name_ = std::move(name);
+          } else {
+            ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str());
+            return Error::InvalidArgument;
           }
         }
       }
-      // Handle use_cache_input: "slot:name" format (e.g., "1:encoder_output")
+      // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output")
       else if (strcmp(option.key, "use_cache_input") == 0) {
        if (auto* arr = std::get_if<
                std::array<char, executorch::runtime::kMaxOptionValueLength>>(
                &option.value)) {
          std::string val(arr->data());
-          auto colon_pos = val.find(':');
-          if (colon_pos != std::string::npos) {
-            try {
-              use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos));
-              use_cache_input_name_ = val.substr(colon_pos + 1);
-            } catch (const std::exception& e) {
-              ET_LOG(
-                  Error,
-                  "Invalid use_cache_input format '%s': %s",
-                  val.c_str(),
-                  e.what());
-              return Error::InvalidArgument;
-            }
+          int slot;
+          std::string name;
+          if (parse_slot_name(val, slot, name)) {
+            use_cache_input_slot_ = slot;
+            use_cache_input_name_ = std::move(name);
+          } else {
+            ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str());
+            return Error::InvalidArgument;
          }
        }
      }
-      // Handle clear_cache_input: reset input cache settings
-      else if (strcmp(option.key, "clear_cache_input") == 0) {
+      // Handle reset_cache_input: disable cache input for subsequent execute() calls.
+      // Note: This only resets the slot/name settings. The stored GPU tensor
+      // remains in memory until overwritten or destroy() is called.
+      else if (strcmp(option.key, "reset_cache_input") == 0) {
        if (auto* val = std::get_if<bool>(&option.value)) {
          if (*val) {
            use_cache_input_slot_ = -1;
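For context, here is a caller-side sketch of how these options are supplied. It uses the same BackendOptions/set_option API that appears in the runner.cpp hunk below; the helper names, slot numbers, and tensor name are hypothetical, and it assumes set_option accepts string-valued options, which the std::array<char, kMaxOptionValueLength> branch above implies.

// Illustrative wiring only; slot numbers, tensor name, and helper names are hypothetical.
void configure_encoder_caching() {
  // Ask CudaBackend to keep encoder output slot 0 on the GPU as "encoder_output".
  ::executorch::runtime::BackendOptions<1> opts;
  opts.set_option("cache_output", "0:encoder_output");
  ::executorch::runtime::set_option("CudaBackend", opts.view());
}

void configure_decoder_cached_input() {
  // Ask CudaBackend to D2D-copy the stored "encoder_output" tensor into
  // decoder input slot 2 on each subsequent execute().
  ::executorch::runtime::BackendOptions<1> opts;
  opts.set_option("use_cache_input", "2:encoder_output");
  ::executorch::runtime::set_option("CudaBackend", opts.view());
}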
@@ -308,6 +346,28 @@ class ET_EXPERIMENTAL CudaBackend final
         n_outputs,
         args.size())
 
+    // Validate cache slot indices if set
+    if (use_cache_input_slot_ >= 0 &&
+        use_cache_input_slot_ >= static_cast<int>(n_inputs)) {
+      ET_LOG(
+          Warning,
+          "use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring",
+          use_cache_input_slot_,
+          n_inputs);
+      use_cache_input_slot_ = -1;
+      use_cache_input_name_.clear();
+    }
+    if (cache_output_slot_ >= 0 &&
+        cache_output_slot_ >= static_cast<int>(n_outputs)) {
+      ET_LOG(
+          Warning,
+          "cache_output slot %d is out of bounds (n_outputs=%zu), ignoring",
+          cache_output_slot_,
+          n_outputs);
+      cache_output_slot_ = -1;
+      cache_output_name_.clear();
+    }
+
     // NOTE: ExecuTorch tensors are always on CPU/host memory
     // We need to create GPU copies for CUDA kernel execution
     std::vector<AOTITensorHandle> gpu_inputs(
@@ -346,8 +406,8 @@ class ET_EXPERIMENTAL CudaBackend final
 
       // Check if this input slot should use a stored GPU tensor
      if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) {
-        auto it = g_gpu_tensors.find(use_cache_input_name_);
-        if (it != g_gpu_tensors.end()) {
+        auto it = gpu_tensors_.find(use_cache_input_name_);
+        if (it != gpu_tensors_.end()) {
          const GpuTensorRef& ref = it->second;
          // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
          size_t numel = gpu_inputs[i]->numel();
@@ -358,8 +418,8 @@ class ET_EXPERIMENTAL CudaBackend final
              copy_bytes == ref.size_bytes,
              Internal,
              "Stored tensor size mismatch: expected %zu bytes, got %zu",
-              copy_bytes,
-              ref.size_bytes);
+              ref.size_bytes,
+              copy_bytes);
 
          cudaError_t cuda_err = cudaMemcpy(
              gpu_inputs[i]->data_ptr(),
@@ -434,25 +494,29 @@ class ET_EXPERIMENTAL CudaBackend final
 
     // Store reference to output GPU tensor if requested.
     // The tensor will be kept alive for later D2D copy to decoder inputs.
-    if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast<int>(n_outputs) &&
-        !cache_output_name_.empty()) {
+    // (Bounds already validated at start of execute())
+    if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) {
      auto* gpu_tensor = gpu_outputs[cache_output_slot_];
      size_t numel = gpu_tensor->numel();
      size_t elem_size = gpu_tensor->element_size();
      size_t size_bytes = numel * elem_size;
 
-      // Delete old tensor if overwriting
-      auto old_it = g_gpu_tensors.find(cache_output_name_);
-      if (old_it != g_gpu_tensors.end() && old_it->second.handle != nullptr) {
-        aoti_torch_delete_tensor_object(old_it->second.handle);
+      // Delete old tensor if overwriting (erase first to prevent double-free)
+      auto old_it = gpu_tensors_.find(cache_output_name_);
+      if (old_it != gpu_tensors_.end()) {
+        AOTITensorHandle old_handle = old_it->second.handle;
+        gpu_tensors_.erase(old_it); // Remove from map before deleting
+        if (old_handle != nullptr) {
+          aoti_torch_delete_tensor_object(old_handle);
+        }
      }
 
      // Store tensor reference (we now own this tensor)
      GpuTensorRef ref;
      ref.handle = gpu_tensor;
      ref.data_ptr = gpu_tensor->data_ptr();
      ref.size_bytes = size_bytes;
-      g_gpu_tensors[cache_output_name_] = ref;
+      gpu_tensors_[cache_output_name_] = ref;
 
      // Reset cache_output settings after caching
      cache_output_slot_ = -1;
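The reason for the erase-before-delete ordering above: once the old handle is freed, nothing that later walks the map (the is_stored scan at the end of execute(), or clear_gpu_tensors() during destroy()) should be able to reach it. A toy sketch of the same pattern, with placeholder types standing in for AOTITensorHandle:

#include <string>
#include <unordered_map>

struct Buffer {}; // stand-in for a GPU tensor handle

void overwrite_entry(
    std::unordered_map<std::string, Buffer*>& cache,
    const std::string& key,
    Buffer* fresh) {
  auto it = cache.find(key);
  if (it != cache.end()) {
    Buffer* old = it->second;
    cache.erase(it); // drop the map entry first...
    delete old;      // ...so no later scan of the map can reach freed memory
  }
  cache[key] = fresh; // the map now owns the replacement buffer
}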
@@ -473,21 +537,25 @@ class ET_EXPERIMENTAL CudaBackend final
          i);
    }
 
-    // Note: use_cache_input settings are intentionally NOT reset here.
-    // They persist across execute() calls to support decoder loops that
-    // reuse cached encoder output. The caller should explicitly clear
-    // these settings using the "clear_cache_input" option when done.
+    // Memory management notes:
+    // - use_cache_input settings persist across execute() calls to support
+    //   decoder loops that reuse the stored encoder output.
+    // - Stored GPU tensors (in gpu_tensors_) remain in memory until:
+    //   (a) overwritten by a new tensor with the same name, or
+    //   (b) destroy() is called, which frees all stored tensors.
+    // - The "reset_cache_input" option only resets the input slot/name settings,
+    //   NOT the stored GPU tensors themselves.
 
    // Cleanup: delete GPU tensors to avoid memory leak across execute() calls.
    // Input tensors are no longer needed after AOTI execution.
    for (size_t i = 0; i < n_inputs; i++) {
      aoti_torch_delete_tensor_object(gpu_inputs[i]);
    }
    // Output tensors are no longer needed after copying to CPU,
-    // EXCEPT for tensors stored in g_gpu_tensors (for later D2D copy).
+    // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy).
    for (size_t i = 0; i < n_outputs; i++) {
      bool is_stored = false;
-      for (const auto& pair : g_gpu_tensors) {
+      for (const auto& pair : gpu_tensors_) {
        if (pair.second.handle == gpu_outputs[i]) {
          is_stored = true;
          break;

extension/asr/runner/runner.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -338,11 +338,13 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
338338
}
339339
}
340340

341-
// Clear cache input settings after decoder loop completes
342-
// This prevents stale cache from being used in subsequent transcribe() calls
341+
// Reset cache input settings after decoder loop completes.
342+
// This disables the D2D copy optimization for subsequent execute() calls.
343+
// Note: The stored GPU tensor remains in memory until the next encoder run
344+
// (which overwrites it) or until the backend is destroyed.
343345
{
344346
::executorch::runtime::BackendOptions<1> opts;
345-
opts.set_option("clear_cache_input", true);
347+
opts.set_option("reset_cache_input", true);
346348
::executorch::runtime::set_option("CudaBackend", opts.view());
347349
}
348350
