@@ -38,25 +38,6 @@ static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-template <typename src_t, typename dst_t>
-static __global__ void cpy_flt_contiguous(const char * cx, char * cdst_direct, const int ne,
-                                          char ** cdst_indirect, int graph_cpynode_index) {
-    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= ne) {
-        return;
-    }
-
-    auto dst = (cdst_indirect != nullptr) ? (dst_t *)cdst_indirect[graph_cpynode_index] : (dst_t *)cdst_direct;
-    auto src = (const src_t *)cx;
-
-    if constexpr (std::is_same_v<dst_t, nv_bfloat16>) {
-        dst[i] = __float2bfloat16(src[i]);
-    } else {
-        dst[i] = (dst_t)src[i];
-    }
-}
-
 static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *)(cdsti);
 
@@ -182,16 +163,6 @@ static void ggml_cpy_flt_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
-template <typename src_t, typename dst_t>
-static void ggml_cpy_flt_contiguous_cuda(
-    const char * cx, char * cdst, const int ne,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, cdst_indirect, graph_cpynode_index++);
-}
-
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -433,8 +404,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
 
-    bool fast_cpy = ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_are_same_shape(src0, src1);
-
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
@@ -460,23 +429,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
             }
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        if (fast_cpy) {
-            ggml_cpy_flt_contiguous_cuda<float, float>(src0_ddc, src1_ddc, ne, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        if (fast_cpy) {
-            ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16>(src0_ddc, src1_ddc, ne, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        if (fast_cpy) {
-            ggml_cpy_flt_contiguous_cuda<float, half>(src0_ddc, src1_ddc, ne, main_stream, dest_ptrs_d, graph_cpynode_index);
-        } else {
-            ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-        }
+        ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
@@ -548,7 +505,6 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 }
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
-    bool fast_cpy = ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_are_same_shape(src0, src1);
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         // Prioritize CUDA graph compatibility over direct memory copy optimization.
         // Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
@@ -558,11 +514,11 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
             return nullptr;
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        return fast_cpy ? (void*)cpy_flt_contiguous<float, float> : (void*) cpy_flt<cpy_1_flt<float, float>>;
+        return (void*) cpy_flt<cpy_1_flt<float, float>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        return fast_cpy ? (void*)cpy_flt_contiguous<float, nv_bfloat16> : (void*) cpy_flt<cpy_1_flt<float, nv_bfloat16>>;
+        return (void*) cpy_flt<cpy_1_flt<float, nv_bfloat16>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        return fast_cpy ? (void*)cpy_flt_contiguous<float, half> : (void*) cpy_flt<cpy_1_flt<float, half>>;
+        return (void*) cpy_flt<cpy_1_flt<float, half>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
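
// For reference, a minimal standalone sketch (not part of this diff or of the ggml API)
// of the contiguous one-thread-per-element copy/convert pattern that the removed
// cpy_flt_contiguous kernel and its ggml_cpy_flt_contiguous_cuda wrapper implemented.
// Every name below is illustrative; only the launch shape mirrors the removed code.
#include <cuda_fp16.h>
#include <cstdio>
#include <vector>

#define COPY_BLOCK_SIZE 256  // stand-in for CUDA_CPY_BLOCK_SIZE

template <typename src_t, typename dst_t>
static __global__ void copy_contiguous(const src_t * src, dst_t * dst, const int64_t ne) {
    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    dst[i] = (dst_t) src[i];  // plain cast; a bf16 destination would use __float2bfloat16
}

int main() {
    const int64_t ne = 1 << 20;                 // number of contiguous elements
    std::vector<float>  h_src(ne, 1.5f);
    std::vector<__half> h_dst(ne);

    float  * d_src = nullptr;
    __half * d_dst = nullptr;
    cudaMalloc(&d_src, ne*sizeof(float));
    cudaMalloc(&d_dst, ne*sizeof(__half));
    cudaMemcpy(d_src, h_src.data(), ne*sizeof(float), cudaMemcpyHostToDevice);

    // one thread per element, rounded up to whole blocks, as in the removed wrapper
    const int num_blocks = (ne + COPY_BLOCK_SIZE - 1) / COPY_BLOCK_SIZE;
    copy_contiguous<float, __half><<<num_blocks, COPY_BLOCK_SIZE>>>(d_src, d_dst, ne);

    cudaMemcpy(h_dst.data(), d_dst, ne*sizeof(__half), cudaMemcpyDeviceToHost);
    printf("dst[0] = %f\n", __half2float(h_dst[0]));

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}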