We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent c80f76a · commit 9462a98 — Copy full SHA for 9462a98
ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -736,6 +736,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
736
float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
737
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
738
}
739
+ } else if (np > 1) {
740
+ // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
741
+ // Therefore, all other warps also need to execute a __syncthreads().
742
+ // Otherwise the points at which warps synchronize with each other would become misaligned.
743
+ __syncthreads();
744
745
746
if (np > 1) {
0 commit comments