@@ -15,21 +15,16 @@ static __global__ void cpy_contiguous(const T * cx, T * cdst_direct, const int n
 
     T * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
 
-    const int elements_per_thread = 4;
-    for (int64_t base_idx = tid * elements_per_thread; base_idx < ne_elements; base_idx += stride * elements_per_thread) {
+    for (int64_t base_idx = tid * CUDA_CPY_ELEMENTS_PER_THREAD; base_idx < ne_elements; base_idx += stride * CUDA_CPY_ELEMENTS_PER_THREAD) {
         const int64_t remaining = ne_elements - base_idx;
 
-        if (remaining >= elements_per_thread) {
-            if (base_idx % 4 == 0) {
-                *((float4 *)(cdst + base_idx)) = *((const float4 *)(cx + base_idx));
-            } else {
-                for (int j = 0; j < elements_per_thread && base_idx + j < ne_elements; ++j) {
-                    cdst[base_idx + j] = cx[base_idx + j];
-                }
-            }
+        if (remaining >= CUDA_CPY_ELEMENTS_PER_THREAD) {
+            *((float4 *)(cdst + base_idx)) = *((const float4 *)(cx + base_idx));
         } else {
-            for (int j = 0; j < remaining; ++j) {
-                cdst[base_idx + j] = cx[base_idx + j];
+            #pragma unroll
+            for (int j = 0; j < CUDA_CPY_ELEMENTS_PER_THREAD; ++j) {
+                size_t i = base_idx + (size_t)j;
+                if (i < ne_elements) cdst[i] = cx[i];
             }
         }
     }
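
The kernel hunk above replaces the local elements_per_thread constant (and the base_idx % 4 check, which always passes because base_idx is a multiple of the per-thread element count) with the shared CUDA_CPY_ELEMENTS_PER_THREAD constant: each thread copies one float4 per full group and falls back to a bounds-checked scalar loop for the final partial group. Below is a minimal standalone sketch of that pattern; the block size, element count, and stand-in constant names are illustrative assumptions, not values taken from the ggml headers.

// Standalone sketch of the grid-stride, float4-vectorized copy pattern shown above.
// The constants and sizes here are illustrative stand-ins, not the ggml values.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

#define BLOCK_SIZE          256   // stand-in for CUDA_CPY_BLOCK_SIZE
#define ELEMENTS_PER_THREAD 4     // stand-in for CUDA_CPY_ELEMENTS_PER_THREAD (one float4)

static __global__ void copy_vec4(const float * cx, float * cdst, const int64_t ne_elements) {
    const int64_t tid    = (int64_t) blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t stride = (int64_t) blockDim.x * gridDim.x;

    for (int64_t base_idx = tid * ELEMENTS_PER_THREAD; base_idx < ne_elements; base_idx += stride * ELEMENTS_PER_THREAD) {
        if (ne_elements - base_idx >= ELEMENTS_PER_THREAD) {
            // full group: one 16-byte load/store; base_idx is always a multiple of 4 floats
            *((float4 *)(cdst + base_idx)) = *((const float4 *)(cx + base_idx));
        } else {
            // partial tail group: bounds-checked scalar copies
            for (int j = 0; j < ELEMENTS_PER_THREAD; ++j) {
                const int64_t i = base_idx + j;
                if (i < ne_elements) cdst[i] = cx[i];
            }
        }
    }
}

int main() {
    const int64_t n = 1000003;  // deliberately not a multiple of 4, to exercise the tail path
    std::vector<float> h_src(n), h_dst(n, 0.0f);
    for (int64_t i = 0; i < n; ++i) h_src[i] = (float) i;

    float * d_src = nullptr;
    float * d_dst = nullptr;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));
    cudaMemcpy(d_src, h_src.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    // same ceil-division launch math as the host-side hunk below
    const int threads_needed = (int) ((n + ELEMENTS_PER_THREAD - 1) / ELEMENTS_PER_THREAD);
    const int num_blocks     = (threads_needed + BLOCK_SIZE - 1) / BLOCK_SIZE;

    copy_vec4<<<num_blocks, BLOCK_SIZE>>>(d_src, d_dst, n);
    cudaMemcpy(h_dst.data(), d_dst, n * sizeof(float), cudaMemcpyDeviceToHost);

    printf("dst[n-1] = %.1f (expected %.1f)\n", h_dst[n - 1], (float) (n - 1));

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}
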
@@ -175,9 +170,8 @@ static void ggml_cpy_contiguous_cuda(
         return;
     }
 
-    const int elements_per_thread = 4;
-    const int threads_needed = (ne_elements + elements_per_thread - 1) / elements_per_thread;
-    const int num_blocks = max(1, min(65535, (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE));
+    const int threads_needed = (ne_elements + CUDA_CPY_ELEMENTS_PER_THREAD - 1) / CUDA_CPY_ELEMENTS_PER_THREAD;
+    const int num_blocks = (threads_needed + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 
     cpy_contiguous<T><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
         (cx, cdst, ne_elements, cdst_indirect, graph_cpynode_index++);
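
On the host side, the new launch math is plain ceil division with no clamping. A quick worked example under the same illustrative assumptions as the sketch above (4 elements per thread, block size 256; the real constants may differ):

    ne_elements    = 1 000 003
    threads_needed = ceil(1 000 003 / 4) = 250 001
    num_blocks     = ceil(250 001 / 256) = 977
    capacity       = 977 * 256 * 4       = 1 000 448 element slots >= ne_elements

Because the kernel walks the buffer with a grid-stride loop, any positive num_blocks still reaches every element; the ceil division simply sizes the grid so the common case finishes in a single pass over the data.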