Revert "CUDA: fix misaligned synchronization in FA (ggml-org#13469)"

Nexesenex · Nexesenex · commit 4721a56ae15b · 2025-05-14T20:24:32.000+02:00
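This reverts commit 95e1888, which added an `else if (np > 1)` branch (in the hunk under `flash_attn_ext_f16_process_tile`) so that warps which do not take the preceding `if` branch also execute a `__syncthreads()`. Per the comment removed with it, warps with `threadIdx.y % np == 0` execute a `__syncthreads()` in the `if` branch, and without a matching call in the other warps the points at which warps synchronize with each other become misaligned. The motivation for the revert is not stated in this commit message.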
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -895,11 +895,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
             dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
         }
-    } else if (np > 1) {
-        // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
-        // Therefore, all other warps also need to execute a __syncthreads().
-        // Otherwise the points at which warps synchronize with each other would become misaligned.
-        __syncthreads();
     }
 
 #pragma unroll

Original file line number	Diff line number	Diff line change
`@@ -895,11 +895,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(`
`895`	`895`	`float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;`
`896`	`896`	`dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);`
`897`	`897`	`}`
`898`		`- } else if (np > 1) {`
`899`		`- // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.`
`900`		`- // Therefore, all other warps also need to execute a __syncthreads().`
`901`		`- // Otherwise the points at which warps synchronize with each other would become misaligned.`
`902`		`- __syncthreads();`
`903`	`898`	`}`
`904`	`899`
`905`	`900`	`#pragma unroll`