@@ -15,21 +15,16 @@ static __global__ void cpy_contiguous(const T * cx, T * cdst_direct, const int n
 
     T * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
 
-    const int elements_per_thread = 4;
-    for (int64_t base_idx = tid * elements_per_thread; base_idx < ne_elements; base_idx += stride * elements_per_thread) {
+    for (int64_t base_idx = tid * CUDA_CPY_ELEMENTS_PER_THREAD; base_idx < ne_elements; base_idx += stride * CUDA_CPY_ELEMENTS_PER_THREAD) {
         const int64_t remaining = ne_elements - base_idx;
 
-        if (remaining >= elements_per_thread) {
-            if (base_idx % 4 == 0) {
-                *((float4 *)(cdst + base_idx)) = *((const float4 *)(cx + base_idx));
-            } else {
-                for (int j = 0; j < elements_per_thread && base_idx + j < ne_elements; ++j) {
-                    cdst[base_idx + j] = cx[base_idx + j];
-                }
-            }
+        if (remaining >= CUDA_CPY_ELEMENTS_PER_THREAD) {
+            *((float4 *)(cdst + base_idx)) = *((const float4 *)(cx + base_idx));
         } else {
-            for (int j = 0; j < remaining; ++j) {
-                cdst[base_idx + j] = cx[base_idx + j];
+            #pragma unroll
+            for (int j = 0; j < CUDA_CPY_ELEMENTS_PER_THREAD; ++j) {
+                size_t i = base_idx + (size_t)j;
+                if (i < ne_elements) cdst[i] = cx[i];
             }
         }
     }
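
The kernel hunk above replaces the local elements_per_thread constant (and the base_idx % 4 check, which always passes because base_idx is a multiple of the per-thread element count) with the shared CUDA_CPY_ELEMENTS_PER_THREAD constant: each thread copies one float4 per full group and falls back to a bounds-checked scalar loop for the final partial group. Below is a minimal standalone sketch of that pattern; the block size, element count, and stand-in constant names are illustrative assumptions, not values taken from the ggml headers.

// Standalone sketch of the grid-stride, float4-vectorized copy pattern shown above.
// The constants and sizes here are illustrative stand-ins, not the ggml values.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

#define BLOCK_SIZE          256   // stand-in for CUDA_CPY_BLOCK_SIZE
#define ELEMENTS_PER_THREAD 4     // stand-in for CUDA_CPY_ELEMENTS_PER_THREAD (one float4)

static __global__ void copy_vec4(const float * cx, float * cdst, const int64_t ne_elements) {
    const int64_t tid    = (int64_t) blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t stride = (int64_t) blockDim.x * gridDim.x;

    for (int64_t base_idx = tid * ELEMENTS_PER_THREAD; base_idx < ne_elements; base_idx += stride * ELEMENTS_PER_THREAD) {
        if (ne_elements - base_idx >= ELEMENTS_PER_THREAD) {
            // full group: one 16-byte load/store; base_idx is always a multiple of 4 floats
            *((float4 *)(cdst + base_idx)) = *((const float4 *)(cx + base_idx));
        } else {
            // partial tail group: bounds-checked scalar copies
            for (int j = 0; j < ELEMENTS_PER_THREAD; ++j) {
                const int64_t i = base_idx + j;
                if (i < ne_elements) cdst[i] = cx[i];
            }
        }
    }
}

int main() {
    const int64_t n = 1000003;  // deliberately not a multiple of 4, to exercise the tail path
    std::vector<float> h_src(n), h_dst(n, 0.0f);
    for (int64_t i = 0; i < n; ++i) h_src[i] = (float) i;

    float * d_src = nullptr;
    float * d_dst = nullptr;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));
    cudaMemcpy(d_src, h_src.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    // same ceil-division launch math as the host-side hunk below
    const int threads_needed = (int) ((n + ELEMENTS_PER_THREAD - 1) / ELEMENTS_PER_THREAD);
    const int num_blocks     = (threads_needed + BLOCK_SIZE - 1) / BLOCK_SIZE;

    copy_vec4<<<num_blocks, BLOCK_SIZE>>>(d_src, d_dst, n);
    cudaMemcpy(h_dst.data(), d_dst, n * sizeof(float), cudaMemcpyDeviceToHost);

    printf("dst[n-1] = %.1f (expected %.1f)\n", h_dst[n - 1], (float) (n - 1));

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}
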
@@ -175,9 +170,8 @@ static void ggml_cpy_contiguous_cuda(
         return;
     }
 
-    const int elements_per_thread = 4;
-    const int threads_needed = (ne_elements + elements_per_thread - 1) / elements_per_thread;
-    const int num_blocks = max(1, min(65535, (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE));
+    const int threads_needed = (ne_elements + CUDA_CPY_ELEMENTS_PER_THREAD - 1) / CUDA_CPY_ELEMENTS_PER_THREAD;
+    const int num_blocks = (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 
     cpy_contiguous<T><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
         (cx, cdst, ne_elements, cdst_indirect, graph_cpynode_index++);
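
On the host side, the new launch math is plain ceil division with no clamping. A quick worked example under the same illustrative assumptions as the sketch above (4 elements per thread, block size 256; the real constants may differ):

    ne_elements    = 1 000 003
    threads_needed = ceil(1 000 003 / 4) = 250 001
    num_blocks     = ceil(250 001 / 256) = 977
    capacity       = 977 * 256 * 4       = 1 000 448 element slots >= ne_elements

Because the kernel walks the buffer with a grid-stride loop, any positive num_blocks still reaches every element; the ceil division simply sizes the grid so the common case finishes in a single pass over the data.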