Skip to content

Commit bfdca26

Browse files
Jeemzz (author) committed
Replace copy kernel with cudaMemcpyAsync
1 parent e38e857 commit bfdca26

File tree

1 file changed

+1
-38
lines changed

1 file changed

+1
-38
lines changed

ggml/src/ggml-cuda/set.cu

Lines changed: 1 addition & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -1,37 +1,6 @@
11
#include "ggml-cuda/common.cuh"
22
#include "set.cuh"
33

4-
static __global__ void set_f32_cuda_copy(const float * __restrict__ src0,
5-
float * __restrict__ dst,
6-
const size_t ne0,
7-
const size_t ne1,
8-
const size_t ne2,
9-
const size_t ne3,
10-
const size_t nb0,
11-
const size_t nb1,
12-
const size_t nb2,
13-
const size_t nb3) {
14-
const size_t total = ne0 * ne1 * ne2 * ne3;
15-
const size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
16-
if (gid >= total) {
17-
return;
18-
}
19-
20-
size_t tmp = gid;
21-
22-
const size_t i0 = tmp % ne0;
23-
tmp /= ne0;
24-
const size_t i1 = tmp % ne1;
25-
tmp /= ne1;
26-
const size_t i2 = tmp % ne2;
27-
tmp /= ne2;
28-
const size_t i3 = tmp;
29-
30-
const size_t pos = (i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
31-
32-
*((float *) ((char *) dst + pos)) = *((const float *) ((const char *) src0 + pos));
33-
}
34-
354
static __global__ void set_f32_cuda(const float * __restrict__ src1,
365
float * __restrict__ dst,
376
const size_t ne10,
@@ -100,16 +69,10 @@ void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
10069

10170
if (!inplace) {
10271
// copy whole src0 -> dst.
103-
const size_t total = ne00 * ne01 * ne02 * ne03;
104-
105-
const int num_blocks = (total + CUDA_SET_BLOCK_SIZE - 1) / CUDA_SET_BLOCK_SIZE;
106-
107-
set_f32_cuda_copy<<<num_blocks, CUDA_SET_BLOCK_SIZE, 0, stream>>>(
108-
src0_d, dst_d, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03);
72+
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, stream));
10973
}
11074

11175
// set: src1 -> dst
112-
// set_f32_cuda
11376

11477
const size_t total = ne10 * ne11 * ne12 * ne13;
11578
const size_t num_blocks = (total + CUDA_SET_BLOCK_SIZE - 1) / CUDA_SET_BLOCK_SIZE;

0 commit comments

Comments (0)