We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent c80f76a · commit 9462a98 — Copy full SHA for 9462a98
ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -736,6 +736,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
736
float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
737
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
738
}
739
+ } else if (np > 1) {
740
+ // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
741
+ // Therefore, all other warps also need to execute a __syncthreads().
742
+ // Otherwise the points at which warps synchronize with each other would become misaligned.
743
+ __syncthreads();
744
745
746
if (np > 1) {
0 commit comments