Skip to content

Commit 61bf5b0

Browse files
committed
refactor and update cpy_contiguous cuda kernel
1 parent 3e32aa4 commit 61bf5b0

File tree

2 files changed

+10
-15
lines changed

2 files changed

+10
-15
lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,16 @@ static __global__ void cpy_contiguous(const T * cx, T * cdst_direct, const int n
1515

1616
T * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
1717

18-
const int elements_per_thread = 4;
19-
for (int64_t base_idx = tid * elements_per_thread; base_idx < ne_elements; base_idx += stride * elements_per_thread) {
18+
for (int64_t base_idx = tid * CUDA_CPY_ELEMENTS_PER_THREAD; base_idx < ne_elements; base_idx += stride * CUDA_CPY_ELEMENTS_PER_THREAD) {
2019
const int64_t remaining = ne_elements - base_idx;
2120

22-
if (remaining >= elements_per_thread) {
23-
if (base_idx % 4 == 0) {
24-
*((float4*)(cdst + base_idx)) = *((const float4*)(cx + base_idx));
25-
} else {
26-
for (int j = 0; j < elements_per_thread && base_idx + j < ne_elements; ++j) {
27-
cdst[base_idx + j] = cx[base_idx + j];
28-
}
29-
}
21+
if (remaining >= CUDA_CPY_ELEMENTS_PER_THREAD) {
22+
*((float4*)(cdst + base_idx)) = *((const float4*)(cx + base_idx));
3023
} else {
31-
for (int j = 0; j < remaining; ++j) {
32-
cdst[base_idx + j] = cx[base_idx + j];
24+
#pragma unroll
25+
for (int j = 0; j < CUDA_CPY_ELEMENTS_PER_THREAD; ++j) {
26+
size_t i = base_idx + (size_t)j;
27+
if (i < ne_elements) cdst[i] = cx[i];
3328
}
3429
}
3530
}
@@ -175,9 +170,8 @@ static void ggml_cpy_contiguous_cuda(
175170
return;
176171
}
177172

178-
const int elements_per_thread = 4;
179-
const int threads_needed = (ne_elements + elements_per_thread - 1) / elements_per_thread;
180-
const int num_blocks = max(1, min(65535, (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE));
173+
const int threads_needed = (ne_elements + CUDA_CPY_ELEMENTS_PER_THREAD - 1) / CUDA_CPY_ELEMENTS_PER_THREAD;
174+
const int num_blocks = (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
181175

182176
cpy_contiguous<T><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
183177
(cx, cdst, ne_elements, cdst_indirect, graph_cpynode_index++);

ggml/src/ggml-cuda/cpy.cuh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "common.cuh"
22

33
#define CUDA_CPY_BLOCK_SIZE 64
4+
#define CUDA_CPY_ELEMENTS_PER_THREAD 4
45

56
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
67

0 commit comments

Comments
 (0)