@@ -329,7 +329,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         } else
 #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
-            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            if (src0->type == GGML_TYPE_F32) {
+                ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            } else {
+                CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            }
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
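Why this hunk matters under CUDA graphs: a cudaMemcpyAsync call captured into a graph bakes its source and destination addresses into the graph node, whereas the copy kernel is passed dest_ptrs_d and graph_cpynode_index, which lets the destination be resolved indirectly at kernel run time. The following is a minimal, self-contained sketch of that indirection idea, not the ggml kernel itself; cpy_f32_indirect, node_idx, and the launch configuration are illustrative assumptions, and only the CUDA runtime calls are real.

// Minimal sketch, not the ggml implementation: a copy kernel that resolves its
// destination through a device-side pointer table, so an already-instantiated
// CUDA graph can be retargeted by rewriting the table instead of being rebuilt.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void cpy_f32_indirect(const float * src, float ** dest_ptrs, int node_idx, int n) {
    float * dst = dest_ptrs[node_idx];                 // destination read at kernel run time
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i];
    }
}

int main() {
    const int n = 1 << 20;
    float *src, *dst_a, *dst_b, **dest_ptrs;
    cudaMalloc((void **) &src,       n * sizeof(float));
    cudaMalloc((void **) &dst_a,     n * sizeof(float));
    cudaMalloc((void **) &dst_b,     n * sizeof(float));
    cudaMalloc((void **) &dest_ptrs, sizeof(float *));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Capture a single launch of the indirect copy into a graph.
    cudaGraph_t     graph;
    cudaGraphExec_t graph_exec;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    cpy_f32_indirect<<<(n + 255) / 256, 256, 0, stream>>>(src, dest_ptrs, /*node_idx=*/0, n);
    cudaStreamEndCapture(stream, &graph);
    cudaGraphInstantiateWithFlags(&graph_exec, graph, 0);

    // Relaunch the same executable graph against two different destinations:
    // only the contents of the device-side pointer table change, never the graph.
    float * dsts[2] = { dst_a, dst_b };
    for (float * dst : dsts) {
        cudaMemcpyAsync(dest_ptrs, &dst, sizeof(float *), cudaMemcpyHostToDevice, stream);
        cudaGraphLaunch(graph_exec, stream);
    }
    cudaStreamSynchronize(stream);
    printf("last CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}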
@@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
 void * ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        return nullptr;
+        // Prioritize CUDA graph compatibility over the direct memory copy optimization:
+        // using the copy kernel here keeps graph indirection working and avoids the performance regression caused by disabling CUDA graphs.
+        if (src0->type == GGML_TYPE_F32) {
+            return (void*) cpy_flt<cpy_1_flt<float, float>>;
+        } else {
+            return nullptr;
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_flt<cpy_1_flt<float, float>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
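For the ggml_cuda_cpy_fn change: the pointer returned here identifies which kernel implements a given copy, so graph-aware code can recognize the matching kernel nodes in a captured graph and update their parameters in place; nullptr signals a plain device-to-device memcpy, which offers no such hook. Below is a hypothetical helper showing that pattern with the CUDA runtime graph API; patch_copy_nodes, copy_kernel_fn, and new_kernel_args are made-up names for illustration and this is not ggml's actual graph-update code.

// Hypothetical helper, not ggml code: given the function pointer returned by
// ggml_cuda_cpy_fn-style logic, walk a captured graph, find the kernel nodes
// that run that copy kernel, and install fresh kernel arguments in the
// instantiated graph. A memcpy node (the nullptr case) cannot be re-parameterized this way.
#include <cuda_runtime.h>
#include <vector>

static bool patch_copy_nodes(cudaGraph_t graph, cudaGraphExec_t graph_exec,
                             void * copy_kernel_fn, void ** new_kernel_args) {
    size_t num_nodes = 0;
    if (cudaGraphGetNodes(graph, nullptr, &num_nodes) != cudaSuccess) {
        return false;
    }
    std::vector<cudaGraphNode_t> nodes(num_nodes);
    cudaGraphGetNodes(graph, nodes.data(), &num_nodes);

    for (cudaGraphNode_t node : nodes) {
        cudaGraphNodeType type;
        if (cudaGraphNodeGetType(node, &type) != cudaSuccess || type != cudaGraphNodeTypeKernel) {
            continue;                       // skip memcpy/memset/host nodes
        }
        cudaKernelNodeParams params = {};
        cudaGraphKernelNodeGetParams(node, &params);
        if (params.func != copy_kernel_fn) {
            continue;                       // some other kernel
        }
        // Point the node at the new argument block without rebuilding the graph.
        params.kernelParams = new_kernel_args;
        if (cudaGraphExecKernelNodeSetParams(graph_exec, node, &params) != cudaSuccess) {
            return false;
        }
    }
    return true;
}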