Skip to content

Commit fefb7a9

Browse files
committed
Synchronizes async preload before Q scaling
Waits for outstanding async loads and syncs threads so Q scaling never races ahead of shared-memory tiles.
1 parent df74af5 commit fefb7a9

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

csrc/flash_dmattn/src/flash_fwd_kernel.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -422,6 +422,9 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
422 | 422 |   | binfo.actual_seqlen_k, binfo.actual_seqlen_q
423 | 423 |   | );
424 | 424 |   |
    | 425 | + | FLASH_NAMESPACE::cp_async_wait<0>();
    | 426 | + | __syncthreads();
    | 427 | + |
425 | 428 |   | // Scale Q once before streaming loop KV
426 | 429 |   | if constexpr (Kernel_traits::Is_Q_in_regs) {
427 | 430 |   | #pragma unroll
@@ -1140,6 +1143,9 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
1140 | 1143 |   | binfo.actual_seqlen_k, binfo.actual_seqlen_q
1141 | 1144 |   | );
1142 | 1145 |   |
     | 1146 | + | FLASH_NAMESPACE::cp_async_wait<0>();
     | 1147 | + | __syncthreads();
     | 1148 | + |
1143 | 1149 |   | // Scale Q once before streaming loop KV
1144 | 1150 |   | #pragma unroll
1145 | 1151 |   | for (int i = 0; i < size(tSrQ); ++i) {

0 commit comments

Comments
 (0)