Fix a data race in the CUDA DeepSeek flash-attention (FA) kernel (#406)

ikawrakow · Iwan Kawrakow · web-flow · commit 36e6e888b75a · 2025-05-11T08:12:47.000+03:00
Reference: ggml-org/llama.cpp#13438

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
diff --git a/ggml/src/ggml-cuda/fattn-new-mma.cu b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -898,6 +898,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
         }
 
+        __syncthreads();
+
         // Write back combined meta data:
 #pragma unroll
         for (int imeta = 0; imeta < nmeta; ++imeta) {

Original file line number	Diff line number	Diff line change
`@@ -898,6 +898,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(`
`898`	`898`	`KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);`
`899`	`899`	`}`
`900`	`900`
	`901`	`+ __syncthreads();`
	`902`	`+`
`901`	`903`	`// Write back combined meta data:`
`902`	`904`	`#pragma unroll`
`903`	`905`	`for (int imeta = 0; imeta < nmeta; ++imeta) {`