Hide memory latency by loop unrolling in reduce_rows_f32

ORippler · ORippler · commit c270ffe1dbe0 · 2025-08-07T00:11:41.000-07:00
diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -2,13 +2,26 @@
 
 // Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
 template<bool norm>
-static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
+static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
     const int row = blockIdx.x;
     const int col = threadIdx.x;
 
     float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
+    const int num_unroll = 24;
+    float temp[num_unroll];
+    for (int i = col; i < ncols;) {
+        for (int j = 0; j < num_unroll; ++j){
+            if (i < ncols){
+                temp[j] = x[row * ncols + i];
+            }
+            else {
+                temp[j] = 0;
+            }
+            i += blockDim.x;
+        }
+        for (int j = 0; j < num_unroll; ++j){
+            sum += temp[j];
+        }
     }
 
     sum = warp_reduce_sum(sum);