@@ -104,75 +104,100 @@ struct fattn_mma_f16_config<576, 512> {
     static constexpr int nbatch_combine = 128;
 };

-template<int stride_tile, int nwarps, int nbatch_fa, bool use_cp_async>
-static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
-        const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV) {
+// ------------------------------------------------------------------------------------------------------------------

-    // K/V data is loaded with decreasing granularity for D for better memory bandwidth.
-    // The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
+// The compiler is unable to unroll loops with the k0_start == k0_stop condition.
+// Therefore, write functions for the loop iterations and unroll the loops manually.

-    if (use_cp_async) {
-        const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
+template<int stride_tile, int nwarps, int nbatch_fa, int stride_k>
+static __device__ __forceinline__ void flash_attn_ext_f16_load_tile_async_loop_iter_async(
+        const half2 * const __restrict__ KV, const unsigned int tile_KV_32, const int chunks_per_row, const int stride_KV) {
+    constexpr int preload = 64;
+    constexpr int h2_per_chunk = 16/sizeof(half2);

-        constexpr int preload = 64;
-        constexpr int h2_per_chunk = 16/sizeof(half2);
+    const int k0_start = stride_k == WARP_SIZE ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
+    const int k0_stop  = chunks_per_row - chunks_per_row % (1*stride_k);
+    const int stride_i = WARP_SIZE / stride_k;

-        const int chunks_per_row = D2 / h2_per_chunk;
+    if (k0_start == k0_stop) {
+        return;
+    }

 #pragma unroll
-        for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4, WARP_SIZE/8, WARP_SIZE/16}) {
-            const int k0_start = stride_k == WARP_SIZE ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
-            const int k0_stop  = chunks_per_row - chunks_per_row % (1*stride_k);
-            const int stride_i = WARP_SIZE / stride_k;
+    for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) {
+        const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);

-            if (k0_start == k0_stop) {
-                continue;
-            }
+        if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) {
+            break;
+        }

 #pragma unroll
-            for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) {
-                const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
+        for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
+            const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);

-                if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) {
-                    break;
-                }
+            cp_async_cg_16<preload>(tile_KV_32 + i*(stride_tile*sizeof(half2)) + k*16, KV + i*stride_KV + k*h2_per_chunk);
+        }
+    }
+}
+
+template<int stride_tile, int nwarps, int nbatch_fa, int stride_k>
+static __device__ __forceinline__ void flash_attn_ext_f16_load_tile_async_loop_iter_sync(
+        const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV) {
+    const int k0_start = stride_k == WARP_SIZE ? 0 : D2 - D2 % (2*stride_k);
+    const int k0_stop  = D2 - D2 % (1*stride_k);
+    const int stride_i = WARP_SIZE / stride_k;
+
+    if (k0_start == k0_stop) {
+        return;
+    }

 #pragma unroll
-                for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
-                    const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
+    for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) {
+        const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);

-                    cp_async_cg_16<preload>(tile_KV_32 + i*(stride_tile*sizeof(half2)) + k*16, KV + i*stride_KV + k*h2_per_chunk);
-                }
-            }
+        if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) {
+            break;
         }
-    } else {
-        static_assert(nbatch_fa % (4*nwarps) == 0, "out of bounds");
+
 #pragma unroll
-        for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
-            const int k0_start = stride_k == WARP_SIZE ? 0 : D2 - D2 % (2*stride_k);
-            const int k0_stop  = D2 - D2 % (1*stride_k);
-            const int stride_i = WARP_SIZE / stride_k;
+        for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
+            const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);

-            if (k0_start == k0_stop || k0_stop <= 0) {
-                continue;
-            }
+            tile_KV[i*stride_tile + k] = KV[i*stride_KV + k];
+        }
+    }
+}

-#pragma unroll
-            for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) {
-                const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
+template<int stride_tile, int nwarps, int nbatch_fa, bool use_cp_async>
+static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
+        const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV) {

-                if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) {
-                    break;
-                }
+    // K/V data is loaded with decreasing granularity for D for better memory bandwidth.
+    // The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.

-#pragma unroll
-                for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
-                    const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
+    if (use_cp_async) {
+        const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
+        constexpr int h2_per_chunk = 16/sizeof(half2);
+        const int chunks_per_row = D2 / h2_per_chunk;

-                    tile_KV[i*stride_tile + k] = KV[i*stride_KV + k];
-                }
-            }
-        }
+        flash_attn_ext_f16_load_tile_async_loop_iter_async<stride_tile, nwarps, nbatch_fa, WARP_SIZE>
+            (KV, tile_KV_32, chunks_per_row, stride_KV);
+        flash_attn_ext_f16_load_tile_async_loop_iter_async<stride_tile, nwarps, nbatch_fa, WARP_SIZE/2>
+            (KV, tile_KV_32, chunks_per_row, stride_KV);
+        flash_attn_ext_f16_load_tile_async_loop_iter_async<stride_tile, nwarps, nbatch_fa, WARP_SIZE/4>
+            (KV, tile_KV_32, chunks_per_row, stride_KV);
+        flash_attn_ext_f16_load_tile_async_loop_iter_async<stride_tile, nwarps, nbatch_fa, WARP_SIZE/8>
+            (KV, tile_KV_32, chunks_per_row, stride_KV);
+        flash_attn_ext_f16_load_tile_async_loop_iter_async<stride_tile, nwarps, nbatch_fa, WARP_SIZE/16>
+            (KV, tile_KV_32, chunks_per_row, stride_KV);
+    } else {
+        static_assert(nbatch_fa % (4*nwarps) == 0, "out of bounds");
+        flash_attn_ext_f16_load_tile_async_loop_iter_sync<stride_tile, nwarps, nbatch_fa, WARP_SIZE>
+            (KV, tile_KV, D2, stride_KV);
+        flash_attn_ext_f16_load_tile_async_loop_iter_sync<stride_tile, nwarps, nbatch_fa, WARP_SIZE/2>
+            (KV, tile_KV, D2, stride_KV);
+        flash_attn_ext_f16_load_tile_async_loop_iter_sync<stride_tile, nwarps, nbatch_fa, WARP_SIZE/4>
+            (KV, tile_KV, D2, stride_KV);
     }
 }

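The helper functions introduced in this hunk implement manual unrolling: the old runtime loop `for (int stride_k : {WARP_SIZE, WARP_SIZE/2, ...})` contained a `continue` on the `k0_start == k0_stop` condition, which (per the comment added above) prevents the compiler from unrolling, so each stride value becomes its own template instantiation that returns early instead. The following standalone sketch is my own illustration of that pattern with hypothetical names (`loop_iter`, `process`, `MY_WARP_SIZE`); it is not code from this commit, but it compiles as ordinary C++/CUDA host code and shows the transformation in isolation:

    // Sketch only: hypothetical stand-ins for the kernel's stride-unrolling pattern.
    #include <cstdio>

    #define MY_WARP_SIZE 32  // stand-in for the library's WARP_SIZE constant

    // One loop iteration per compile-time stride; an early return replaces the old `continue`.
    template <int stride_k>
    static void loop_iter(const int n) {
        const int k0_start = stride_k == MY_WARP_SIZE ? 0 : n - n % (2*stride_k);
        const int k0_stop  = n - n % (1*stride_k);

        if (k0_start == k0_stop) {
            return; // nothing left for this stride
        }

        for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
            printf("stride %2d handles k0 = %d\n", stride_k, k0);
        }
    }

    // Manually unrolled replacement for `for (int stride_k : {32, 16, 8})`.
    static void process(const int n) {
        loop_iter<MY_WARP_SIZE  >(n);
        loop_iter<MY_WARP_SIZE/2>(n);
        loop_iter<MY_WARP_SIZE/4>(n);
    }

    int main() {
        process(72); // strides 32 and 8 do work, stride 16 returns immediately
        return 0;
    }

Because `stride_k` is a template parameter, each instantiation is a separate straight-line call site, and the caller no longer contains a loop whose body may skip an iteration.
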
@@ -848,10 +873,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     }
 #pragma unroll
     for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) {
-        if (offset >= WARP_SIZE) {
-            continue;
+        if (offset < WARP_SIZE) {
+            KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE));
         }
-        KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE));
     }

     float KQ_cms[nmeta]; // KQ combine max scale per warp.
@@ -867,10 +891,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     }
 #pragma unroll
     for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) {
-        if (offset >= WARP_SIZE) {
-            continue;
+        if (offset < WARP_SIZE) {
+            KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
         }
-        KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
     }

     // Write back combined meta data:
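
Both hunks above rewrite the warp-level reductions of `KQ_cmn` and `KQ_crs` so that over-wide shuffle offsets are skipped by an `if (offset < WARP_SIZE)` guard around the shuffle rather than by `continue`, keeping the `#pragma unroll` loop body free of early exits, in keeping with the rationale stated in the first hunk. The reduced value is unchanged; only the control flow differs. Below is a standalone sketch of that guard pattern under my own assumptions: the names (`warp_reduce_max`, `reduce_demo`) and the simplified reduction bounds are hypothetical and not taken from this commit.

    // Sketch only: guarded butterfly shuffle whose starting offset may exceed the warp width.
    #include <cstdio>
    #include <cuda_runtime.h>

    #define WARP_SIZE 32

    // Max-reduction over a warp; offsets that would reach past the warp are skipped by a guard
    // around the shuffle rather than by `continue`, so the loop still unrolls cleanly.
    template <int reduce_width>
    static __device__ float warp_reduce_max(float v) {
    #pragma unroll
        for (int offset = reduce_width/2; offset >= 1; offset >>= 1) {
            if (offset < WARP_SIZE) {
                v = fmaxf(v, __shfl_xor_sync(0xFFFFFFFF, v, offset, WARP_SIZE));
            }
        }
        return v;
    }

    static __global__ void reduce_demo(const float * x, float * dst) {
        const float v = warp_reduce_max<64>(x[threadIdx.x]); // reduce_width > WARP_SIZE is harmless
        if (threadIdx.x == 0) {
            *dst = v;
        }
    }

    int main() {
        float hx[WARP_SIZE];
        for (int i = 0; i < WARP_SIZE; ++i) {
            hx[i] = (float) ((i*7) % 31); // arbitrary test data, maximum value is 30
        }
        float * dx;
        float * dout;
        cudaMalloc(&dx,   WARP_SIZE*sizeof(float));
        cudaMalloc(&dout, sizeof(float));
        cudaMemcpy(dx, hx, WARP_SIZE*sizeof(float), cudaMemcpyHostToDevice);
        reduce_demo<<<1, WARP_SIZE>>>(dx, dout);
        float hout = 0.0f;
        cudaMemcpy(&hout, dout, sizeof(float), cudaMemcpyDeviceToHost);
        printf("warp max = %g\n", hout); // expected: 30
        cudaFree(dx);
        cudaFree(dout);
        return 0;
    }

With `reduce_width = 64` the first offset (32) is simply skipped by the guard and the remaining offsets perform a full in-warp max, so the loop bounds can be written generically without ever shuffling past the warp boundary.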