@@ -185,7 +185,7 @@ class ET_EXPERIMENTAL CudaBackend final
               &option.value)) {
         store_output_name_ = std::string(arr->data());
       } else {
-        ET_LOG(Warning, "store_output option expects a string value");
+        ET_LOG(Error, "store_output option expects a string value");
         return Error::InvalidArgument;
       }
     }
@@ -196,7 +196,7 @@ class ET_EXPERIMENTAL CudaBackend final
               &option.value)) {
         use_stored_input_name_ = std::string(arr->data());
       } else {
-        ET_LOG(Warning, "use_stored_input option expects a string value");
+        ET_LOG(Error, "use_stored_input option expects a string value");
         return Error::InvalidArgument;
       }
     }
@@ -209,7 +209,7 @@ class ET_EXPERIMENTAL CudaBackend final
           use_stored_input_name_.clear();
         }
       } else {
-        ET_LOG(Warning, "reset_stored_input option expects a boolean value");
+        ET_LOG(Error, "reset_stored_input option expects a boolean value");
         return Error::InvalidArgument;
       }
     }
@@ -228,7 +228,7 @@ class ET_EXPERIMENTAL CudaBackend final
           gpu_tensors_.erase(it);
         }
       } else {
-        ET_LOG(Warning, "clear_stored_tensor option expects a string value");
+        ET_LOG(Error, "clear_stored_tensor option expects a string value");
         return Error::InvalidArgument;
       }
     }
@@ -400,6 +400,10 @@ class ET_EXPERIMENTAL CudaBackend final
     };
     TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};
 
+    // Track which input index was matched for D2D copy (for duplicate
+    // detection)
+    ssize_t matched_input_idx = -1;
+
     // Process input tensors: ExecuTorch provides CPU tensors, create GPU
     // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
     for (size_t i = 0; i < n_inputs; i++) {
@@ -424,15 +428,12 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for input %d",
+          "Failed to create GPU tensor for input %zu",
          i);
 
       gpu_inputs[i] = gpu_input_handle;
 
       // Check if this input matches a stored GPU tensor (by size).
-      // Note: Size-based matching assumes only one input will match. If
-      // multiple inputs have the same byte size as the stored tensor, the first
-      // match wins.
       if (!use_stored_input_name_.empty()) {
         auto it = gpu_tensors_.find(use_stored_input_name_);
         if (it != gpu_tensors_.end()) {
@@ -443,29 +444,45 @@ class ET_EXPERIMENTAL CudaBackend final
 
           // Match by size: use stored tensor if sizes match
           if (copy_bytes == ref.size_bytes) {
-            ET_LOG(
-                Debug,
-                "Using stored tensor '%s' for input %d (%zu bytes, D2D copy)",
-                use_stored_input_name_.c_str(),
-                i,
-                copy_bytes);
-
-            // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
-            cudaError_t cuda_err = cudaMemcpy(
-                gpu_inputs[i]->data_ptr(),
-                ref.data_ptr,
-                copy_bytes,
-                cudaMemcpyDeviceToDevice);
-
-            ET_CHECK_OR_RETURN_ERROR(
-                cuda_err == cudaSuccess,
-                Internal,
-                "Failed GPU-to-GPU copy for input %d: %s",
-                i,
-                cudaGetErrorString(cuda_err));
-
-            // Skip the CPU-to-GPU copy below
-            continue;
+            if (matched_input_idx >= 0) {
+              // Another input already matched - warn about ambiguity
+              ET_LOG(
+                  Error,
+                  "Multiple inputs match stored tensor '%s' size (%zu bytes): "
+                  "input %zd was used, input %zu also matches. "
+                  "Consider using unique tensor sizes or a different matching strategy.",
+                  use_stored_input_name_.c_str(),
+                  copy_bytes,
+                  matched_input_idx,
+                  i);
+            } else {
+              // First match - perform D2D copy
+              matched_input_idx = static_cast<ssize_t>(i);
+
+              ET_LOG(
+                  Debug,
+                  "Using stored tensor '%s' for input %zu (%zu bytes, D2D copy)",
+                  use_stored_input_name_.c_str(),
+                  i,
+                  copy_bytes);
+
+              // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
+              cudaError_t cuda_err = cudaMemcpy(
+                  gpu_inputs[i]->data_ptr(),
+                  ref.data_ptr,
+                  copy_bytes,
+                  cudaMemcpyDeviceToDevice);
+
+              ET_CHECK_OR_RETURN_ERROR(
+                  cuda_err == cudaSuccess,
+                  Internal,
+                  "Failed GPU-to-GPU copy for input %zu: %s",
+                  i,
+                  cudaGetErrorString(cuda_err));
+
+              // Skip the CPU-to-GPU copy below
+              continue;
+            }
           }
         }
       }
@@ -474,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
           Internal,
-          "Failed to copy input %d from CPU to GPU",
+          "Failed to copy input %zu from CPU to GPU",
          i);
     }
     // Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -501,7 +518,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for output %d",
+          "Failed to create GPU tensor for output %zu",
          i);
 
       gpu_outputs[i] = gpu_output_handle;
@@ -563,11 +580,11 @@ class ET_EXPERIMENTAL CudaBackend final
         // For DYNAMIC_BOUND tensors we try to resize
         ET_CHECK_OK_OR_RETURN_ERROR(
             resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-            "Error resizing tensor at output index %d",
+            "Error resizing tensor at output index %zu",
            i);
         ET_CHECK_OK_OR_RETURN_ERROR(
             aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-            "Failed to copy GPU output %d back to CPU",
+            "Failed to copy GPU output %zu back to CPU",
            i);
       }
 
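For review purposes, here is a minimal standalone sketch of the size-based matching strategy the new hunk implements: the first input whose byte size equals the stored tensor's size receives the device-to-device copy, and any later input with the same size only produces an ambiguity warning. The names StoredTensor and match_stored_input below are illustrative only and are not part of the CudaBackend API.

// Standalone sketch (not backend code) of "first match wins" with duplicate detection.
#include <cstddef>
#include <cstdio>
#include <vector>

struct StoredTensor {
  size_t size_bytes; // byte size of the tensor held on the GPU
};

// Returns the index of the input that would receive the D2D copy, or -1 if
// no input matches. Warns when more than one input matches the stored size.
long match_stored_input(
    const std::vector<size_t>& input_sizes,
    const StoredTensor& stored) {
  long matched = -1;
  for (size_t i = 0; i < input_sizes.size(); i++) {
    if (input_sizes[i] != stored.size_bytes) {
      continue;
    }
    if (matched >= 0) {
      // Later match: report ambiguity, keep the first match.
      std::printf(
          "warning: input %zu also matches %zu bytes; input %ld was used\n",
          i,
          stored.size_bytes,
          matched);
    } else {
      matched = static_cast<long>(i); // first match wins, as in the hunk above
    }
  }
  return matched;
}

int main() {
  StoredTensor stored{1024};
  std::vector<size_t> inputs{256, 1024, 1024}; // two inputs share the stored size
  std::printf("D2D copy goes to input %ld\n", match_stored_input(inputs, stored));
  return 0;
}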