diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 746f43966b84c..4322d4405c523 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -7,6 +7,29 @@
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
+template <typename T>
+static __global__ void cpy_contiguous(const T * cx, T * cdst_direct, const int ne_elements,
+                                      T ** cdst_indirect, int graph_cpynode_index) {
+    const int64_t tid    = blockDim.x * blockIdx.x + threadIdx.x;
+    const int64_t stride = blockDim.x * gridDim.x;
+
+    T * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
+
+    for (int64_t base_idx = tid * CUDA_CPY_ELEMENTS_PER_THREAD; base_idx < ne_elements; base_idx += stride * CUDA_CPY_ELEMENTS_PER_THREAD) {
+        const int64_t remaining = ne_elements - base_idx;
+
+        if (remaining >= CUDA_CPY_ELEMENTS_PER_THREAD) {
+            *((float4*)(cdst + base_idx)) = *((const float4*)(cx + base_idx));
+        } else {
+            #pragma unroll
+            for (int j = 0; j < CUDA_CPY_ELEMENTS_PER_THREAD; ++j) {
+                size_t i = base_idx + (size_t)j;
+                if (i < ne_elements) cdst[i] = cx[i];
+            }
+        }
+    }
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne,
                                const int ne00, const int ne01, const int ne02,
                                const int nb00, const int nb01, const int nb02,
@@ -138,6 +161,22 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
 #endif
 }
 
+template <typename T>
+static void ggml_cpy_contiguous_cuda(
+    const T * cx, T * cdst, const int ne_elements,
+    cudaStream_t stream, T ** cdst_indirect, int & graph_cpynode_index) {
+
+    if (ne_elements <= 0) {
+        return;
+    }
+
+    const int threads_needed = (ne_elements + CUDA_CPY_ELEMENTS_PER_THREAD - 1) / CUDA_CPY_ELEMENTS_PER_THREAD;
+    const int num_blocks     = (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+
+    cpy_contiguous<<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne_elements, cdst_indirect, graph_cpynode_index++);
+}
+
 template<typename src_t, typename dst_t>
 static void ggml_cpy_flt_cuda(
     const char * cx, char * cdst, const int ne,
@@ -330,7 +369,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
             if (src0->type == GGML_TYPE_F32) {
-                ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+                ggml_cpy_contiguous_cuda((const float*)src0_ddc, (float*)src1_ddc, ne, main_stream, (float**)dest_ptrs_d, graph_cpynode_index);
             } else {
                 CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
             }
@@ -407,7 +446,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         // Prioritize CUDA graph compatibility over direct memory copy optimization.
         // Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
         if (src0->type == GGML_TYPE_F32) {
-            return (void*) cpy_flt<cpy_1_flt<float, float>>;
+            return (void*) cpy_contiguous<float>;
         } else {
             return nullptr;
         }
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
index 0bd3c0c6f8c27..40d97b7769e05 100644
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -1,6 +1,7 @@
 #include "common.cuh"
 
 #define CUDA_CPY_BLOCK_SIZE 64
+#define CUDA_CPY_ELEMENTS_PER_THREAD 4
 
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
 
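
For reference, below is a minimal standalone sketch (not part of the patch) of the access pattern the new kernel uses: a grid-stride loop that copies full groups of four floats with one 16-byte float4 load/store and falls back to scalar copies for the 1-3 element tail, sized with the same grid calculation as ggml_cpy_contiguous_cuda. It assumes float data and 16-byte-aligned device buffers (cudaMalloc guarantees this, and the group offsets are multiples of 16 bytes); the names copy_f32_vec4, ELEMS_PER_THREAD, and BLOCK_SIZE are illustrative stand-ins for the patch's identifiers, not ggml symbols.

// Standalone sketch of the vectorized contiguous-copy pattern (assumptions noted above).
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

constexpr int ELEMS_PER_THREAD = 4;   // one float4 per full group, mirrors CUDA_CPY_ELEMENTS_PER_THREAD
constexpr int BLOCK_SIZE       = 64;  // mirrors CUDA_CPY_BLOCK_SIZE

__global__ void copy_f32_vec4(const float * src, float * dst, const int64_t n) {
    const int64_t tid    = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t stride = blockDim.x * gridDim.x;

    // grid-stride loop: each thread handles groups of 4 contiguous elements
    for (int64_t base = tid * ELEMS_PER_THREAD; base < n; base += stride * ELEMS_PER_THREAD) {
        if (n - base >= ELEMS_PER_THREAD) {
            // full group: single 16-byte transaction (base is a multiple of 4 floats, so aligned)
            *(float4 *)(dst + base) = *(const float4 *)(src + base);
        } else {
            // tail of 1-3 elements: plain scalar copies
            for (int64_t i = base; i < n; ++i) {
                dst[i] = src[i];
            }
        }
    }
}

int main() {
    const int64_t n = 1000003;  // deliberately not a multiple of 4, to exercise the tail path
    std::vector<float> h_src(n), h_dst(n, 0.0f);
    for (int64_t i = 0; i < n; ++i) {
        h_src[i] = (float) i;
    }

    float * d_src = nullptr;
    float * d_dst = nullptr;
    cudaMalloc((void **) &d_src, n * sizeof(float));
    cudaMalloc((void **) &d_dst, n * sizeof(float));
    cudaMemcpy(d_src, h_src.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    // same grid sizing as the patch: one thread per group of 4 elements, rounded up
    const int threads_needed = (int) ((n + ELEMS_PER_THREAD - 1) / ELEMS_PER_THREAD);
    const int num_blocks     = (threads_needed + BLOCK_SIZE - 1) / BLOCK_SIZE;
    copy_f32_vec4<<<num_blocks, BLOCK_SIZE>>>(d_src, d_dst, n);

    cudaMemcpy(h_dst.data(), d_dst, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("dst[n-1] = %f (expected %f)\n", h_dst[n - 1], (float) (n - 1));

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}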