
Commit 0fbcf97

cuda copy kernel created for contiguous ggml tensors
1 parent a23b9bd commit 0fbcf97


ggml/src/ggml-cuda/cpy.cu

Lines changed: 47 additions & 2 deletions
@@ -7,6 +7,34 @@
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
+template<typename T>
+static __global__ void cpy_contiguous(const T * cx, T * cdst_direct, const int ne_elements,
+                                      T ** cdst_indirect, int graph_cpynode_index) {
+    const int64_t tid    = blockDim.x * blockIdx.x + threadIdx.x;
+    const int64_t stride = blockDim.x * gridDim.x;
+
+    T * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
+
+    const int elements_per_thread = 4;
+    for (int64_t base_idx = tid * elements_per_thread; base_idx < ne_elements; base_idx += stride * elements_per_thread) {
+        const int64_t remaining = ne_elements - base_idx;
+
+        if (remaining >= elements_per_thread) {
+            if (base_idx % 4 == 0) {
+                *((float4*)(cdst + base_idx)) = *((const float4*)(cx + base_idx));
+            } else {
+                for (int j = 0; j < elements_per_thread && base_idx + j < ne_elements; ++j) {
+                    cdst[base_idx + j] = cx[base_idx + j];
+                }
+            }
+        } else {
+            for (int j = 0; j < remaining; ++j) {
+                cdst[base_idx + j] = cx[base_idx + j];
+            }
+        }
+    }
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -138,6 +166,23 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
 #endif
 }
 
+template<typename T>
+static void ggml_cpy_contiguous_cuda(
+    const T * cx, T * cdst, const int ne_elements,
+    cudaStream_t stream, T ** cdst_indirect, int & graph_cpynode_index) {
+
+    if (ne_elements <= 0) {
+        return;
+    }
+
+    const int elements_per_thread = 4;
+    const int threads_needed = (ne_elements + elements_per_thread - 1) / elements_per_thread;
+    const int num_blocks = max(1, min(65535, (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE));
+
+    cpy_contiguous<T><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne_elements, cdst_indirect, graph_cpynode_index++);
+}
+
 template<typename src_t, typename dst_t>
 static void ggml_cpy_flt_cuda(
     const char * cx, char * cdst, const int ne,
@@ -330,7 +375,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
     {
         if (src0->type == GGML_TYPE_F32) {
-            ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            ggml_cpy_contiguous_cuda<float>((const float*)src0_ddc, (float*)src1_ddc, ne, main_stream, (float**)dest_ptrs_d, graph_cpynode_index);
         } else {
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
@@ -407,7 +452,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         // Prioritize CUDA graph compatibility over direct memory copy optimization.
         // Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
         if (src0->type == GGML_TYPE_F32) {
-            return (void*) cpy_flt<cpy_1_flt<float, float>>;
+            return (void*) cpy_contiguous<float>;
         } else {
             return nullptr;
         }
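
For readers skimming the diff: the new cpy_contiguous kernel is a grid-stride loop in which each thread copies four consecutive floats through a single float4 load/store whenever a full group of four remains, and falls back to a scalar loop for the tail; the launcher sizes the grid so that roughly one thread is created per four elements, capped at 65535 blocks. The standalone sketch below reproduces that access pattern under stated assumptions (16-byte aligned buffers, the hypothetical kernel name copy_f32_vec4, a block size of 256 standing in for CUDA_CPY_BLOCK_SIZE); it is illustrative only and not part of the commit.

// Standalone sketch of the vectorized grid-stride copy pattern used above.
// Names and sizes are illustrative assumptions, not taken from the commit.
#include <cuda_runtime.h>
#include <cstdio>

static __global__ void copy_f32_vec4(const float * src, float * dst, const int n) {
    const int64_t tid    = blockDim.x * (int64_t) blockIdx.x + threadIdx.x;
    const int64_t stride = blockDim.x * (int64_t) gridDim.x;

    // Each thread handles groups of 4 elements, striding over the whole buffer.
    for (int64_t i = tid * 4; i < n; i += stride * 4) {
        if (n - i >= 4) {
            // One 16-byte vector load/store per group of four floats.
            *(float4 *)(dst + i) = *(const float4 *)(src + i);
        } else {
            // Scalar copy for the final partial group.
            for (int64_t j = i; j < n; ++j) {
                dst[j] = src[j];
            }
        }
    }
}

int main() {
    const int n = 1 << 20;                       // 1M floats (illustrative size)
    float *src, *dst;
    cudaMalloc(&src, n * sizeof(float));
    cudaMalloc(&dst, n * sizeof(float));

    const int block_size = 256;                  // stands in for CUDA_CPY_BLOCK_SIZE
    const int threads    = (n + 3) / 4;          // one thread per 4-element group
    const int blocks     = (threads + block_size - 1) / block_size;

    copy_f32_vec4<<<blocks, block_size>>>(src, dst, n);
    cudaDeviceSynchronize();
    printf("copy launched with %d blocks of %d threads\n", blocks, block_size);

    cudaFree(src);
    cudaFree(dst);
    return 0;
}

Because cudaMalloc returns allocations aligned to at least 256 bytes, float4 accesses at element indices that are multiples of four remain 16-byte aligned, which is what makes the vectorized path safe for contiguous buffers.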
