Skip to content

Commit e535aee

Browse files
committed
More simplification
1 parent 03d27e7 commit e535aee

File tree

2 files changed

+68
-147
lines changed

2 files changed

+68
-147
lines changed

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 54 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -57,50 +57,12 @@ struct GpuTensorRef {
5757
size_t size_bytes; // Total size in bytes
5858
};
5959

60-
// Parses "slot:name" format string. Returns true on success.
61-
// Uses character-by-character parsing to avoid std::stoi exceptions.
62-
static bool parse_slot_name(
63-
const std::string& val,
64-
int& out_slot,
65-
std::string& out_name) {
66-
auto colon_pos = val.find(':');
67-
if (colon_pos == std::string::npos || colon_pos == 0) {
68-
return false;
69-
}
70-
71-
// Parse slot number manually to avoid exceptions
72-
int slot = 0;
73-
for (size_t i = 0; i < colon_pos; i++) {
74-
char c = val[i];
75-
if (c < '0' || c > '9') {
76-
return false; // Non-digit character
77-
}
78-
int digit = c - '0';
79-
// Check for overflow
80-
if (slot > (INT_MAX - digit) / 10) {
81-
return false;
82-
}
83-
slot = slot * 10 + digit;
84-
}
85-
86-
std::string name = val.substr(colon_pos + 1);
87-
if (name.empty()) {
88-
return false;
89-
}
90-
91-
out_slot = slot;
92-
out_name = std::move(name);
93-
return true;
94-
}
95-
9660
class ET_EXPERIMENTAL CudaBackend final
9761
: public ::executorch::runtime::BackendInterface {
9862
private:
99-
// Cache control options (set via set_option before execute)
100-
mutable int cache_output_slot_ = -1; // Which output slot to cache (-1 = none)
101-
mutable std::string cache_output_name_; // Name to cache output under
102-
mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none)
103-
mutable std::string use_cache_input_name_; // Name of cached tensor to use
63+
// Storage control options (set via set_option before execute)
64+
mutable std::string store_output_name_; // Name to store output under (empty = none)
65+
mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none)
10466

10567
// Per-instance map of named GPU tensor references.
10668
// Mutable because execute() is const but needs to modify this.
@@ -110,7 +72,7 @@ class ET_EXPERIMENTAL CudaBackend final
11072
// - Caller must ensure the producing execute() call (e.g., encoder) completes
11173
// before any consuming execute() call (e.g., decoder) begins.
11274
// - Caller must not call destroy() while execute() is in progress.
113-
// - Overwriting a tensor (same cache name) deletes the old tensor immediately,
75+
// - Overwriting a tensor (same name) deletes the old tensor immediately,
11476
// so caller must ensure no concurrent execute() is using it.
11577
mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;
11678

@@ -173,49 +135,39 @@ class ET_EXPERIMENTAL CudaBackend final
173135
backend_options) override {
174136
for (size_t i = 0; i < backend_options.size(); i++) {
175137
const auto& option = backend_options[i];
176-
// Handle cache_output: "slot:name" format (e.g., "0:encoder_output")
177-
if (strcmp(option.key, "cache_output") == 0) {
138+
// Handle store_output: expects a string name (e.g., "encoder_output")
139+
if (strcmp(option.key, "store_output") == 0) {
178140
if (auto* arr = std::get_if<
179141
std::array<char, executorch::runtime::kMaxOptionValueLength>>(
180142
&option.value)) {
181-
std::string val(arr->data());
182-
int slot;
183-
std::string name;
184-
if (parse_slot_name(val, slot, name)) {
185-
cache_output_slot_ = slot;
186-
cache_output_name_ = std::move(name);
187-
} else {
188-
ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str());
189-
return Error::InvalidArgument;
190-
}
143+
store_output_name_ = std::string(arr->data());
144+
} else {
145+
ET_LOG(Warning, "store_output option expects a string value");
146+
return Error::InvalidArgument;
191147
}
192148
}
193-
// Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output")
194-
else if (strcmp(option.key, "use_cache_input") == 0) {
149+
// Handle use_stored_input: expects a string name (e.g., "encoder_output")
150+
else if (strcmp(option.key, "use_stored_input") == 0) {
195151
if (auto* arr = std::get_if<
196152
std::array<char, executorch::runtime::kMaxOptionValueLength>>(
197153
&option.value)) {
198-
std::string val(arr->data());
199-
int slot;
200-
std::string name;
201-
if (parse_slot_name(val, slot, name)) {
202-
use_cache_input_slot_ = slot;
203-
use_cache_input_name_ = std::move(name);
204-
} else {
205-
ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str());
206-
return Error::InvalidArgument;
207-
}
154+
use_stored_input_name_ = std::string(arr->data());
155+
} else {
156+
ET_LOG(Warning, "use_stored_input option expects a string value");
157+
return Error::InvalidArgument;
208158
}
209159
}
210-
// Handle reset_cache_input: disable cache input for subsequent execute() calls.
211-
// Note: This only resets the slot/name settings. The stored GPU tensor
160+
// Handle reset_stored_input: expects a boolean value
161+
// Note: This only resets the name setting. The stored GPU tensor
212162
// remains in memory until overwritten or destroy() is called.
213-
else if (strcmp(option.key, "reset_cache_input") == 0) {
163+
else if (strcmp(option.key, "reset_stored_input") == 0) {
214164
if (auto* val = std::get_if<bool>(&option.value)) {
215165
if (*val) {
216-
use_cache_input_slot_ = -1;
217-
use_cache_input_name_.clear();
166+
use_stored_input_name_.clear();
218167
}
168+
} else {
169+
ET_LOG(Warning, "reset_stored_input option expects a boolean value");
170+
return Error::InvalidArgument;
219171
}
220172
}
221173
}
@@ -346,28 +298,6 @@ class ET_EXPERIMENTAL CudaBackend final
346298
n_outputs,
347299
args.size())
348300

349-
// Validate cache slot indices if set
350-
if (use_cache_input_slot_ >= 0 &&
351-
use_cache_input_slot_ >= static_cast<int>(n_inputs)) {
352-
ET_LOG(
353-
Warning,
354-
"use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring",
355-
use_cache_input_slot_,
356-
n_inputs);
357-
use_cache_input_slot_ = -1;
358-
use_cache_input_name_.clear();
359-
}
360-
if (cache_output_slot_ >= 0 &&
361-
cache_output_slot_ >= static_cast<int>(n_outputs)) {
362-
ET_LOG(
363-
Warning,
364-
"cache_output slot %d is out of bounds (n_outputs=%zu), ignoring",
365-
cache_output_slot_,
366-
n_outputs);
367-
cache_output_slot_ = -1;
368-
cache_output_name_.clear();
369-
}
370-
371301
// NOTE: ExecuTorch tensors are always on CPU/host memory
372302
// We need to create GPU copies for CUDA kernel execution
373303
std::vector<AOTITensorHandle> gpu_inputs(
@@ -404,40 +334,35 @@ class ET_EXPERIMENTAL CudaBackend final
404334

405335
gpu_inputs[i] = gpu_input_handle;
406336

407-
// Check if this input slot should use a stored GPU tensor
408-
if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) {
409-
auto it = gpu_tensors_.find(use_cache_input_name_);
337+
// Check if this input matches a stored GPU tensor (by size)
338+
if (!use_stored_input_name_.empty()) {
339+
auto it = gpu_tensors_.find(use_stored_input_name_);
410340
if (it != gpu_tensors_.end()) {
411341
const GpuTensorRef& ref = it->second;
412-
// GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
413342
size_t numel = gpu_inputs[i]->numel();
414343
size_t elem_size = gpu_inputs[i]->element_size();
415344
size_t copy_bytes = numel * elem_size;
416345

417-
ET_CHECK_OR_RETURN_ERROR(
418-
copy_bytes == ref.size_bytes,
419-
Internal,
420-
"Stored tensor size mismatch: expected %zu bytes, got %zu",
421-
ref.size_bytes,
422-
copy_bytes);
423-
424-
cudaError_t cuda_err = cudaMemcpy(
425-
gpu_inputs[i]->data_ptr(),
426-
ref.data_ptr,
427-
copy_bytes,
428-
cudaMemcpyDeviceToDevice);
429-
430-
ET_CHECK_OR_RETURN_ERROR(
431-
cuda_err == cudaSuccess,
432-
Internal,
433-
"Failed GPU-to-GPU copy for input %d: %s",
434-
i,
435-
cudaGetErrorString(cuda_err));
436-
437-
// Skip the CPU-to-GPU copy below
438-
continue;
346+
// Match by size: use stored tensor if sizes match
347+
if (copy_bytes == ref.size_bytes) {
348+
// GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
349+
cudaError_t cuda_err = cudaMemcpy(
350+
gpu_inputs[i]->data_ptr(),
351+
ref.data_ptr,
352+
copy_bytes,
353+
cudaMemcpyDeviceToDevice);
354+
355+
ET_CHECK_OR_RETURN_ERROR(
356+
cuda_err == cudaSuccess,
357+
Internal,
358+
"Failed GPU-to-GPU copy for input %d: %s",
359+
i,
360+
cudaGetErrorString(cuda_err));
361+
362+
// Skip the CPU-to-GPU copy below
363+
continue;
364+
}
439365
}
440-
// Not found: fall through to normal CPU-to-GPU copy
441366
}
442367

443368
// Copy data from CPU to GPU (normal path)
@@ -493,16 +418,16 @@ class ET_EXPERIMENTAL CudaBackend final
493418
error);
494419

495420
// Store reference to output GPU tensor if requested.
421+
// Always uses gpu_outputs[0] (encoder has single output).
496422
// The tensor will be kept alive for later D2D copy to decoder inputs.
497-
// (Bounds already validated at start of execute())
498-
if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) {
499-
auto* gpu_tensor = gpu_outputs[cache_output_slot_];
423+
if (!store_output_name_.empty() && n_outputs > 0) {
424+
auto* gpu_tensor = gpu_outputs[0];
500425
size_t numel = gpu_tensor->numel();
501426
size_t elem_size = gpu_tensor->element_size();
502427
size_t size_bytes = numel * elem_size;
503428

504429
// Delete old tensor if overwriting (erase first to prevent double-free)
505-
auto old_it = gpu_tensors_.find(cache_output_name_);
430+
auto old_it = gpu_tensors_.find(store_output_name_);
506431
if (old_it != gpu_tensors_.end()) {
507432
AOTITensorHandle old_handle = old_it->second.handle;
508433
gpu_tensors_.erase(old_it); // Remove from map before deleting
@@ -516,11 +441,10 @@ class ET_EXPERIMENTAL CudaBackend final
516441
ref.handle = gpu_tensor;
517442
ref.data_ptr = gpu_tensor->data_ptr();
518443
ref.size_bytes = size_bytes;
519-
gpu_tensors_[cache_output_name_] = ref;
444+
gpu_tensors_[store_output_name_] = ref;
520445

521-
// Reset cache_output settings after caching
522-
cache_output_slot_ = -1;
523-
cache_output_name_.clear();
446+
// Reset store_output name after storing
447+
store_output_name_.clear();
524448
}
525449

526450
// Copy GPU output results back to CPU output tensors
@@ -538,12 +462,12 @@ class ET_EXPERIMENTAL CudaBackend final
538462
}
539463

540464
// Memory management notes:
541-
// - use_cache_input settings persist across execute() calls to support
465+
// - use_stored_input setting persists across execute() calls to support
542466
// decoder loops that reuse the stored encoder output.
543467
// - Stored GPU tensors (in gpu_tensors_) remain in memory until:
544468
// (a) overwritten by a new tensor with the same name, or
545469
// (b) destroy() is called, which frees all stored tensors.
546-
// - The "reset_cache_input" option only resets the input slot/name settings,
470+
// - The "reset_stored_input" option only resets the input name setting,
547471
// NOT the stored GPU tensors themselves.
548472

549473
// Cleanup: delete GPU tensors to avoid memory leak across execute() calls.

extension/asr/runner/runner.cpp

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -198,14 +198,14 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
198198
}
199199
}
200200

201-
// Tell CUDA backend to cache encoder output (slot 0) as "encoder_output"
201+
// Tell CUDA backend to store encoder output as "encoder_output"
202202
{
203203
::executorch::runtime::BackendOptions<1> opts;
204-
opts.set_option("cache_output", "0:encoder_output");
204+
opts.set_option("store_output", "encoder_output");
205205
auto err =
206206
::executorch::runtime::set_option("CudaBackend", opts.view());
207207
if (err != ::executorch::runtime::Error::Ok) {
208-
ET_LOG(Info, "Failed to set cache_output option (backend may not support caching)");
208+
ET_LOG(Warning, "Failed to set store_output option (backend may not support storage)");
209209
}
210210
}
211211

@@ -263,22 +263,15 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
263263
decoder_inputs.emplace_back(encoder_output_ptr);
264264
decoder_inputs.emplace_back(cache_position_ptr);
265265

266-
// Tell CUDA backend to use cached encoder output for decoder input slot 2.
267-
//
268-
// Why slot 2? The AOTI-compiled decoder receives inputs in a different order
269-
// than we pass them in decoder_inputs above. The AOTI input order was
270-
// determined empirically by examining tensor shapes during execution.
271-
//
272-
// The "2:encoder_output" format tells the backend to use the stored GPU
273-
// tensor named "encoder_output" for AOTI input slot 2. This avoids redundant
274-
// CPU->GPU copies on each decoder iteration.
266+
// Tell CUDA backend to use stored encoder output for matching decoder inputs.
267+
// The backend matches by tensor size, avoiding redundant CPU->GPU copies.
275268
{
276269
::executorch::runtime::BackendOptions<1> opts;
277-
opts.set_option("use_cache_input", "2:encoder_output");
270+
opts.set_option("use_stored_input", "encoder_output");
278271
auto err =
279272
::executorch::runtime::set_option("CudaBackend", opts.view());
280273
if (err != ::executorch::runtime::Error::Ok) {
281-
ET_LOG(Info, "Failed to set use_cache_input option (backend may not support caching)");
274+
ET_LOG(Warning, "Failed to set use_stored_input option (backend may not support storage)");
282275
}
283276
}
284277

@@ -338,14 +331,18 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
338331
}
339332
}
340333

341-
// Reset cache input settings after decoder loop completes.
334+
// Reset stored input settings after decoder loop completes.
342335
// This disables the D2D copy optimization for subsequent execute() calls.
343336
// Note: The stored GPU tensor remains in memory until the next encoder run
344337
// (which overwrites it) or until the backend is destroyed.
345338
{
346339
::executorch::runtime::BackendOptions<1> opts;
347-
opts.set_option("reset_cache_input", true);
348-
::executorch::runtime::set_option("CudaBackend", opts.view());
340+
opts.set_option("reset_stored_input", true);
341+
auto err =
342+
::executorch::runtime::set_option("CudaBackend", opts.view());
343+
if (err != ::executorch::runtime::Error::Ok) {
344+
ET_LOG(Warning, "Failed to set reset_stored_input option");
345+
}
349346
}
350347

351348
// Reset coloring

0 commit comments

Comments
 (0)