7 | 7 |
8 | 8 | typedef void (*cpy_kernel_t)(const char * cx, char * cdst); |
9 | 9 |
| 10 | +template<typename T> |
| 11 | +static __global__ void cpy_contiguous(const T * cx, T * cdst_direct, const int ne_elements, |
| 12 | + T ** cdst_indirect, int graph_cpynode_index) { |
| 13 | + const int64_t tid = blockDim.x * blockIdx.x + threadIdx.x; |
| 14 | + const int64_t stride = blockDim.x * gridDim.x; |
| 15 | + |
| 16 | + T * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct; |
| 17 | + |
| 18 | + const int elements_per_thread = 4; |
| 19 | + for (int64_t base_idx = tid * elements_per_thread; base_idx < ne_elements; base_idx += stride * elements_per_thread) { |
| 20 | + const int64_t remaining = ne_elements - base_idx; |
| 21 | + |
| 22 | + if (remaining >= elements_per_thread) { |
| 23 | + if (base_idx % 4 == 0) { |
| 24 | + *((float4*)(cdst + base_idx)) = *((const float4*)(cx + base_idx)); |
| 25 | + } else { |
| 26 | + for (int j = 0; j < elements_per_thread && base_idx + j < ne_elements; ++j) { |
| 27 | + cdst[base_idx + j] = cx[base_idx + j]; |
| 28 | + } |
| 29 | + } |
| 30 | + } else { |
| 31 | + for (int j = 0; j < remaining; ++j) { |
| 32 | + cdst[base_idx + j] = cx[base_idx + j]; |
| 33 | + } |
| 34 | + } |
| 35 | + } |
| 36 | +} |
| 37 | + |
10 | 38 | template <cpy_kernel_t cpy_1> |
11 | 39 | static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne, |
12 | 40 | const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, |
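
A note on the vectorized path in cpy_contiguous above: because base_idx starts at tid * elements_per_thread and advances by stride * elements_per_thread, it is always a multiple of four, so the base_idx % 4 fallback is effectively unreachable; what the float4 load/store really depends on is cx and cdst being 16-byte aligned (and T being a 4-byte type such as float, the only instantiation used below). A minimal host-side guard one could pair with it is sketched here; the helper name is hypothetical and not part of this diff.

    #include <cstdint>

    // Hypothetical guard (sketch, not in this PR): the float4 fast path in cpy_contiguous
    // is only well-defined when both buffers are 16-byte aligned; a caller could fall back
    // to a scalar copy kernel whenever this returns false.
    static bool cpy_buffers_vec4_aligned(const void * src, const void * dst) {
        return (reinterpret_cast<std::uintptr_t>(src) % 16 == 0) &&
               (reinterpret_cast<std::uintptr_t>(dst) % 16 == 0);
    }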
@@ -138,6 +166,23 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des |
138 | 166 | #endif |
139 | 167 | } |
140 | 168 |
| 169 | +template<typename T> |
| 170 | +static void ggml_cpy_contiguous_cuda( |
| 171 | + const T * cx, T * cdst, const int ne_elements, |
| 172 | + cudaStream_t stream, T ** cdst_indirect, int & graph_cpynode_index) { |
| 173 | + |
| 174 | + if (ne_elements <= 0) { |
| 175 | + return; |
| 176 | + } |
| 177 | + |
| 178 | + const int elements_per_thread = 4; |
| 179 | + const int threads_needed = (ne_elements + elements_per_thread - 1) / elements_per_thread; |
| 180 | + const int num_blocks = max(1, min(65535, (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE)); |
| 181 | + |
| 182 | + cpy_contiguous<T><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> |
| 183 | + (cx, cdst, ne_elements, cdst_indirect, graph_cpynode_index++); |
| 184 | +} |
| 185 | + |
141 | 186 | template<typename src_t, typename dst_t> |
142 | 187 | static void ggml_cpy_flt_cuda( |
143 | 188 | const char * cx, char * cdst, const int ne, |
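
The launcher above gives each thread a four-element chunk and clamps num_blocks to 65535, relying on the kernel's grid-stride loop to cover anything a single grid pass misses. A small host-side sanity check of that arithmetic (sketch; 256 is an assumed stand-in for CUDA_CPY_BLOCK_SIZE, whose real value is defined in the existing headers):

    #include <algorithm>
    #include <cstdio>

    // Reproduces the grid-size computation from ggml_cpy_contiguous_cuda with an assumed
    // block size, and reports how many grid-stride passes the kernel needs per tensor size.
    int main() {
        const int block_size          = 256; // assumption, stand-in for CUDA_CPY_BLOCK_SIZE
        const int elements_per_thread = 4;
        const int sizes[]             = {1000003, 200 * 1000 * 1000};

        for (int ne_elements : sizes) {
            const int threads_needed = (ne_elements + elements_per_thread - 1) / elements_per_thread;
            const int num_blocks     = std::max(1, std::min(65535, (threads_needed + block_size - 1) / block_size));
            const long long per_pass = 1LL * num_blocks * block_size * elements_per_thread;
            std::printf("ne=%d blocks=%d elements/pass=%lld passes=%lld\n",
                        ne_elements, num_blocks, per_pass, (ne_elements + per_pass - 1) / per_pass);
        }
    }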
@@ -330,7 +375,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg |
330 | 375 | #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY |
331 | 376 | { |
332 | 377 | if (src0->type == GGML_TYPE_F32) { |
333 | | - ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); |
| 378 | + ggml_cpy_contiguous_cuda<float>((const float*)src0_ddc, (float*)src1_ddc, ne, main_stream, (float**)dest_ptrs_d, graph_cpynode_index); |
334 | 379 | } else { |
335 | 380 | CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); |
336 | 381 | } |
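
For the contiguous F32 branch above, the launcher can also be exercised outside the CUDA-graph machinery by passing a null indirection table, in which case the kernel writes through cdst_direct. A standalone test sketch follows (hypothetical helper, not part of the PR; assumes the declarations from this file are visible and the file is compiled as CUDA):

    #include <vector>
    #include <cassert>

    // Sketch: round-trip an odd-sized buffer through the contiguous F32 copy so that both
    // the float4 fast path and the scalar tail path are exercised; indirection is disabled.
    static void test_contiguous_f32_copy(cudaStream_t stream) {
        const int n = 1000003; // odd size: the last thread takes the scalar remainder branch
        std::vector<float> host_src(n), host_dst(n, 0.0f);
        for (int i = 0; i < n; ++i) {
            host_src[i] = (float) i;
        }

        float * d_src = nullptr;
        float * d_dst = nullptr;
        cudaMalloc(&d_src, n * sizeof(float));
        cudaMalloc(&d_dst, n * sizeof(float));
        cudaMemcpyAsync(d_src, host_src.data(), n * sizeof(float), cudaMemcpyHostToDevice, stream);

        int graph_cpynode_index = 0; // with cdst_indirect == nullptr the index is not used for addressing
        ggml_cpy_contiguous_cuda<float>(d_src, d_dst, n, stream, /*cdst_indirect=*/nullptr, graph_cpynode_index);

        cudaMemcpyAsync(host_dst.data(), d_dst, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);
        for (int i = 0; i < n; ++i) {
            assert(host_dst[i] == host_src[i]);
        }

        cudaFree(d_src);
        cudaFree(d_dst);
    }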
@@ -407,7 +452,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { |
407 | 452 | // Prioritize CUDA graph compatibility over direct memory copy optimization. |
408 | 453 | // Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs. |
409 | 454 | if (src0->type == GGML_TYPE_F32) { |
410 | | - return (void*) cpy_flt<cpy_1_flt<float, float>>; |
| 455 | + return (void*) cpy_contiguous<float>; |
411 | 456 | } else { |
412 | 457 | return nullptr; |
413 | 458 | } |