1 | 1 | #include "ggml-cuda/common.cuh" |
2 | 2 | #include "set.cuh" |
3 | 3 |
4 | | -static __global__ void set_f32_cuda_copy(const float * __restrict__ src0, |
5 | | - float * __restrict__ dst, |
6 | | - const size_t ne0, |
7 | | - const size_t ne1, |
8 | | - const size_t ne2, |
9 | | - const size_t ne3, |
10 | | - const size_t nb0, |
11 | | - const size_t nb1, |
12 | | - const size_t nb2, |
13 | | - const size_t nb3) { |
14 | | - const size_t total = ne0 * ne1 * ne2 * ne3; |
15 | | - const size_t gid = blockIdx.x * blockDim.x + threadIdx.x; |
16 | | - if (gid >= total) { |
17 | | - return; |
18 | | - } |
19 | | - |
20 | | - size_t tmp = gid; |
21 | | - |
22 | | - const size_t i0 = tmp % ne0; |
23 | | - tmp /= ne0; |
24 | | - const size_t i1 = tmp % ne1; |
25 | | - tmp /= ne1; |
26 | | - const size_t i2 = tmp % ne2; |
27 | | - tmp /= ne2; |
28 | | - const size_t i3 = tmp; |
29 | | - |
30 | | - const size_t pos = (i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); |
31 | | - |
32 | | - *((float *) ((char *) dst + pos)) = *((const float *) ((const char *) src0 + pos)); |
33 | | -} |
34 | | - |
35 | 4 | static __global__ void set_f32_cuda(const float * __restrict__ src1, |
36 | 5 | float * __restrict__ dst, |
37 | 6 | const size_t ne10, |
@@ -100,16 +69,10 @@ void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { |
100 | 69 |
101 | 70 | if (!inplace) { |
102 | 71 | // copy whole src0 -> dst. |
103 | | - const size_t total = ne00 * ne01 * ne02 * ne03; |
104 | | - |
105 | | - const int num_blocks = (total + CUDA_SET_BLOCK_SIZE - 1) / CUDA_SET_BLOCK_SIZE; |
106 | | - |
107 | | - set_f32_cuda_copy<<<num_blocks, CUDA_SET_BLOCK_SIZE, 0, stream>>>( |
108 | | - src0_d, dst_d, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03); |
| 72 | + CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, stream)); |
109 | 73 | } |
110 | 74 |
111 | 75 | // set: src1 -> dst |
112 | | - // set_f32_cuda |
113 | 76 |
114 | 77 | const size_t total = ne10 * ne11 * ne12 * ne13; |
115 | 78 | const size_t num_blocks = (total + CUDA_SET_BLOCK_SIZE - 1) / CUDA_SET_BLOCK_SIZE; |
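
For context (not part of the diff itself): the removed set_f32_cuda_copy kernel unraveled a flat thread index into (i0, i1, i2, i3), rebuilt the byte offset from the nb0..nb3 strides, and copied one float per thread. The replacement enqueues a single device-to-device cudaMemcpyAsync of ggml_nbytes(dst) bytes on the same stream as the subsequent set_f32_cuda launch; because operations on one CUDA stream execute in order, the copy is guaranteed to complete before the set kernel writes src1 into dst, so no extra synchronization is needed. Below is a minimal, self-contained sketch of that stream-ordered copy-then-overwrite pattern; the kernel, buffer names, and sizes are illustrative placeholders, not ggml's actual code.

```cpp
#include <cuda_runtime.h>

// Placeholder kernel: overwrite a contiguous region of dst with src1,
// analogous in spirit to the set step (the real set_f32_cuda uses strides).
__global__ void overwrite_region(float * dst, const float * src1, size_t n, size_t offset) {
    const size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dst[offset + i] = src1[i];
    }
}

int main() {
    const size_t n_dst = 1024, n_src1 = 256, offset = 128;  // placeholder sizes
    float * src0_d = nullptr;
    float * src1_d = nullptr;
    float * dst_d  = nullptr;
    cudaMalloc(&src0_d, n_dst  * sizeof(float));
    cudaMalloc(&src1_d, n_src1 * sizeof(float));
    cudaMalloc(&dst_d,  n_dst  * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // 1) base copy: whole src0 -> dst, device to device, asynchronous on `stream`
    cudaMemcpyAsync(dst_d, src0_d, n_dst * sizeof(float), cudaMemcpyDeviceToDevice, stream);

    // 2) overwrite the target region with src1 on the same stream; stream ordering
    //    guarantees the memcpy above has finished before this kernel starts
    const int block_size = 256;
    const int num_blocks = (int) ((n_src1 + block_size - 1) / block_size);
    overwrite_region<<<num_blocks, block_size, 0, stream>>>(dst_d, src1_d, n_src1, offset);

    cudaStreamSynchronize(stream);

    cudaFree(src0_d);
    cudaFree(src1_d);
    cudaFree(dst_d);
    cudaStreamDestroy(stream);
    return 0;
}
```

As a general design note, a plain contiguous device-to-device copy is usually better served by cudaMemcpyAsync than by a hand-rolled copy kernel: it avoids an extra kernel launch and lets the driver choose the copy path.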