Commit bbfbbc3

Use ElementMask and split mask copy/reduce
Standardizes the mask dtype to an explicit element type (ElementMask) in global and shared memory, fixing type mismatches and ensuring alignment. Aligns the shared mask buffer via a placeholder tensor and updates the layout to avoid misaligned accesses. Replaces the fused mask copy+reduce with a generic copy, an explicit barrier, and a separate OR-reduction, making the synchronization point visible at each call site. Unifies bias handling onto the generic copy path (copy_MN).
1 parent 69df087 commit bbfbbc3
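
The core of the change: the fused copy_mask_with_or_reduce is split into a plain gmem-to-smem copy (copy_MN), an explicit __syncthreads() barrier, and a separate block-wide OR-reduction (mask_or_reduce) that decides whether the tile has any active position. Below is a minimal, self-contained CUDA sketch of that sequence; the flat tiling, uint8_t mask type, and kernel shape are illustrative stand-ins, not the CuTe-based helpers the diff actually calls.

#include <cstdint>

constexpr int kBlockN = 128;

// Sketch: gate per-tile work on an OR-reduction over a mask tile.
// Assumes blockDim.x is a multiple of 32.
__global__ void mask_gate_sketch(const uint8_t* __restrict__ gmem_mask,
                                 int* __restrict__ out_active_tiles,
                                 int n_tiles) {
    __shared__ uint8_t smask[kBlockN];
    __shared__ int any_active_s;

    for (int tile = blockIdx.x; tile < n_tiles; tile += gridDim.x) {
        // 1. Generic gmem -> smem copy (the role copy_MN plays in the diff).
        for (int i = threadIdx.x; i < kBlockN; i += blockDim.x) {
            smask[i] = gmem_mask[tile * kBlockN + i];
        }
        if (threadIdx.x == 0) any_active_s = 0;
        // 2. Barrier: the tile must be fully written before anyone reduces over it.
        __syncthreads();
        // 3. Explicit OR-reduce (the role mask_or_reduce plays in the diff).
        bool local = false;
        for (int i = threadIdx.x; i < kBlockN; i += blockDim.x) {
            local |= (smask[i] != 0);
        }
        if (__any_sync(0xffffffffu, local) && (threadIdx.x % 32 == 0)) {
            atomicOr(&any_active_s, 1);
        }
        __syncthreads();
        // 4. Skip the tile's K/V work entirely when nothing is active.
        if (any_active_s != 0 && threadIdx.x == 0) {
            atomicAdd(out_active_tiles, 1);  // stand-in for the real tile compute
        }
        __syncthreads();  // smask and the flag are reused next iteration
    }
}

The old fused helper hid the barrier inside itself (the removed comments below say it "is already or_syncthreads"); splitting it makes the synchronization explicit and lets the mask share the same generic copy path as K, V, and bias.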

File tree

1 file changed: +76 -97 lines

csrc/flash_dmattn/src/flash_fwd_kernel.h

Lines changed: 76 additions & 97 deletions
@@ -54,6 +54,7 @@ template<typename Kernel_traits, bool Is_causal, bool Has_mask, bool Has_bias, b
 inline __device__ void compute_attn_1rowblock(const Params &params, const int bidb, const int bidh, const int m_block) {
 
     using Element = typename Kernel_traits::Element;
+    using ElementMask = typename Kernel_traits::ElementMask;
     using ElementAccum = typename Kernel_traits::ElementAccum;
     using index_t = typename Kernel_traits::index_t;

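The diff assumes Kernel_traits now exposes an ElementMask alias alongside Element; its definition lives in the kernel-traits header, which this commit does not touch. A hypothetical sketch of the shape of that trait, for orientation only:

#include <cstdint>

// Hypothetical: the real aliases are defined in flash_dmattn's kernel-traits
// header, not shown in this commit.
template <typename elem_type>
struct Kernel_traits_sketch {
    using Element = elem_type;     // compute dtype, e.g. half or bfloat16
    using ElementMask = uint8_t;   // assumed explicit mask dtype in gmem/smem
    using ElementAccum = float;    // accumulator dtype
    using index_t = int64_t;       // offset dtype
};
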
@@ -169,7 +170,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
         make_coord(_, 0)
     ); // (kBlockN, kHeadDim, nblocksN)
     Tensor mMask = make_tensor(
-        make_gmem_ptr(reinterpret_cast<const bool*>(params.mask_ptr) + binfo.mask_offset(params.mask_batch_stride, params.mask_row_stride, bidb)),
+        make_gmem_ptr(reinterpret_cast<ElementMask*>(params.mask_ptr) + binfo.mask_offset(params.mask_batch_stride, params.mask_row_stride, bidb)),
         make_shape(params.h_mask, binfo.actual_seqlen_q, binfo.actual_seqlen_k),
         make_stride(params.mask_head_stride, params.mask_row_stride, _1{})
     );
@@ -216,13 +217,17 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
         sV.data().get(),
         typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{}
     );
-    Tensor sMask = make_tensor(
+    Tensor sMaskPlace = make_tensor(
         Has_mask ? sV.data() + size(sV) : sV.data(),
-        typename Kernel_traits::SmemLayoutAtomPS{}
+        typename Kernel_traits::SmemLayoutPS{}
+    ); // For pointer alignment only
+    Tensor sMask = make_tensor(
+        make_smem_ptr(reinterpret_cast<ElementMask*>(sMaskPlace.data().get())),
+        typename Kernel_traits::SmemLayoutPS{}
     );
     Tensor sBias = make_tensor(
-        Has_bias ? (Has_mask ? sMask.data() + size(sMask) : sV.data() + size(sV)) : sV.data(),
-        typename Kernel_traits::SmemLayoutAtomPS{}
+        Has_bias ? (Has_mask ? sMaskPlace.data() + size(sMaskPlace) : sV.data() + size(sV)) : sV.data(),
+        typename Kernel_traits::SmemLayoutPS{}
     );
 
     // Global to Shared Memory operation
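
sMaskPlace exists only to carve the mask region out of shared memory in units of the wider Element type, so the region's base address inherits Element alignment; sMask then views the same bytes as ElementMask. Sizing sBias from sMaskPlace rather than sMask keeps the bias pointer Element-aligned as well, which is why the diff switches it to sMaskPlace.data() + size(sMaskPlace). A freestanding sketch of the idea, with illustrative types in place of the CuTe tensors:

#include <cuda_fp16.h>
#include <cstdint>

extern __shared__ char smem_[];

// Illustrative types; the real Element/ElementMask come from Kernel_traits.
using Element = __half;
using ElementMask = uint8_t;

__device__ ElementMask* smem_mask_view(int v_elems) {
    Element* sV = reinterpret_cast<Element*>(smem_);
    // Placeholder: advance in units of the wider Element type so the mask
    // region starts at an Element-aligned address ...
    Element* sMaskPlace = sV + v_elems;
    // ... then reinterpret the same bytes as the narrower mask type.
    return reinterpret_cast<ElementMask*>(sMaskPlace);
}
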
@@ -364,25 +369,19 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
     }
 
     if constexpr (Has_mask) {
-        // FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
-        //     gmem_tiled_copy_Mask,
-        //     tMaskgMask(_, _, _, n_block), tMasksMask,
-        //     tMaskcMask, tMaskpMask,
-        //     binfo.actual_seqlen_q - m_block * kBlockM
-        // );
-        // cute::cp_async_fence();
-        // FLASH_NAMESPACE::cp_async_wait<0>();
-        // // Do OR-reduce on the mask to see if any active threads
-
-
-        FLASH_NAMESPACE::copy_mask_with_or_reduce<Is_even_MN, /*Clear_OOB_MN=*/true, /*To_type=*/Element>(
+        FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
             gmem_tiled_copy_Mask,
             tMaskgMask(_, _, _, n_block), tMasksMask,
-            any_active,
             tMaskcMask, tMaskpMask,
             binfo.actual_seqlen_q - m_block * kBlockM
         );
-        // We don't need to syncthreads here because copy_mask is already or_syncthreads.
+        __syncthreads();
+        // Do OR-reduce on the mask to see if any active threads for current iteration.
+        FLASH_NAMESPACE::mask_or_reduce(
+            tMasksMask,
+            any_active,
+            smem_thr_copy_Mask
+        );
     }
 
     // We don't need to clear the sK smem tiles since we'll mask out the scores anyway.
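
Note the ordering contract made explicit here: mask_or_reduce reads tMasksMask, a tile that every thread may have partially written during copy_MN, so the __syncthreads() between the two calls is required for correctness, not a stylistic choice. The same copy/barrier/reduce sequence recurs at every mask-load site below, including the split-KV kernel.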
@@ -394,7 +393,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
         binfo.actual_seqlen_k - n_block * kBlockN
     );
     if constexpr (Has_bias) {
-        FLASH_NAMESPACE::copy_bias<Is_even_MN, /*Clear_OOB_MN=*/true>(
+        FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
             gmem_tiled_copy_Bias,
             tBiasgBias(_, _, _, n_block), tBiassBias,
             tBiascBias, tBiaspBias,
@@ -524,24 +523,19 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
 
         if (n_block > n_block_min) {
             if constexpr (Has_mask) {
-                // FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
-                //     gmem_tiled_copy_Mask,
-                //     tMaskgMask(_, _, _, n_block - 1), tMasksMask,
-                //     tMaskcMask, tMaskpMask,
-                //     binfo.actual_seqlen_q - m_block * kBlockM
-                // );
-                // cute::cp_async_fence();
-                // FLASH_NAMESPACE::cp_async_wait<0>();
-                // // Do OR-reduce on the mask to see if any active threads for next iteration.
-
-                FLASH_NAMESPACE::copy_mask_with_or_reduce<Is_even_MN, /*Clear_OOB_MN=*/true, /*To_type=*/Element>(
+                FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
                     gmem_tiled_copy_Mask,
                     tMaskgMask(_, _, _, n_block - 1), tMasksMask,
-                    any_active_next,
                     tMaskcMask, tMaskpMask,
                     binfo.actual_seqlen_q - m_block * kBlockM
                 );
-                // We don't need to syncthreads here because copy_mask is already or_syncthreads.
+                __syncthreads();
+                // Do OR-reduce on the mask to see if any active threads for next iteration.
+                FLASH_NAMESPACE::mask_or_reduce(
+                    tMasksMask,
+                    any_active_next,
+                    smem_thr_copy_Mask
+                );
             }
 
             if (any_active_next) {
@@ -551,7 +545,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
                     tKVcKV, tKVpKV
                 );
                 if constexpr (Has_bias) {
-                    FLASH_NAMESPACE::copy_bias<Is_even_MN, /*Clear_OOB_MN=*/true>(
+                    FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
                         gmem_tiled_copy_Bias,
                         tBiasgBias(_, _, _, n_block - 1), tBiassBias,
                         tBiascBias, tBiaspBias,
@@ -684,24 +678,19 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
 
         if (n_block > n_block_min) {
             if constexpr (Has_mask) {
-                // FLASH_NAMESPACE::copy_MN</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
-                //     gmem_tiled_copy_Mask,
-                //     tMaskgMask(_, _, _, n_block - 1), tMasksMask,
-                //     tMaskcMask, tMaskpMask,
-                //     binfo.actual_seqlen_q - m_block * kBlockM
-                // );
-                // cute::cp_async_fence();
-                // FLASH_NAMESPACE::cp_async_wait<0>();
-                // // Do OR-reduce on the mask to see if any active threads for next iteration.
-
-                FLASH_NAMESPACE::copy_mask_with_or_reduce</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false, /*To_type=*/Element>(
+                FLASH_NAMESPACE::copy_MN</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
                     gmem_tiled_copy_Mask,
                     tMaskgMask(_, _, _, n_block - 1), tMasksMask,
-                    any_active_next,
                     tMaskcMask, tMaskpMask,
                     binfo.actual_seqlen_q - m_block * kBlockM
                 );
-                // We don't need to syncthreads here because copy_mask is already or_syncthreads
+                __syncthreads();
+                // Do OR-reduce on the mask to see if any active threads for next iteration.
+                FLASH_NAMESPACE::mask_or_reduce(
+                    tMasksMask,
+                    any_active_next,
+                    smem_thr_copy_Mask
+                );
             }
 
             if (any_active_next) {
@@ -711,7 +700,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
                     tKVcKV, tKVpKV
                 );
                 if constexpr (Has_bias) {
-                    FLASH_NAMESPACE::copy_bias</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
+                    FLASH_NAMESPACE::copy_MN</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
                         gmem_tiled_copy_Bias,
                         tBiasgBias(_, _, _, n_block - 1), tBiassBias,
                         tBiascBias, tBiaspBias,
@@ -834,6 +823,7 @@ template<typename Kernel_traits, bool Is_causal, bool Has_mask, bool Has_bias, b
 inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, const int bidb, const int bidh, const int m_block, const int n_split_idx, const int num_n_splits) {
 
     using Element = typename Kernel_traits::Element;
+    using ElementMask = typename Kernel_traits::ElementMask;
     using ElementAccum = typename Kernel_traits::ElementAccum;
     using index_t = typename Kernel_traits::index_t;

@@ -970,7 +960,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
         make_stride(params.v_row_stride, _1{})
     );
     Tensor gMask = make_tensor(
-        make_gmem_ptr(reinterpret_cast<const bool *>(params.mask_ptr) + col_offset_mask),
+        make_gmem_ptr(reinterpret_cast<ElementMask *>(params.mask_ptr) + col_offset_mask),
         Shape<Int<kBlockM>, Int<kBlockN>>{},
         make_stride(params.mask_row_stride, _1{})
     );
@@ -1001,13 +991,17 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
         sV.data().get(),
         typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{}
     );
-    Tensor sMask = make_tensor(
+    Tensor sMaskPlace = make_tensor(
         Has_mask ? sV.data() + size(sV) : sV.data(),
-        typename Kernel_traits::SmemLayoutAtomPS{}
+        typename Kernel_traits::SmemLayoutPS{}
+    ); // For pointer alignment only
+    Tensor sMask = make_tensor(
+        make_smem_ptr(reinterpret_cast<ElementMask*>(sMaskPlace.data().get())),
+        typename Kernel_traits::SmemLayoutPS{}
     );
     Tensor sBias = make_tensor(
-        Has_bias ? (Has_mask ? sMask.data() + size(sMask) : sV.data() + size(sV)) : sV.data(),
-        typename Kernel_traits::SmemLayoutAtomPS{}
+        Has_bias ? (Has_mask ? sMaskPlace.data() + size(sMaskPlace) : sV.data() + size(sV)) : sV.data(),
+        typename Kernel_traits::SmemLayoutPS{}
     );
 
     // Global to Shared Memory operation
@@ -1115,24 +1109,19 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
     );
 
     if constexpr (Has_mask) {
-        // FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
-        //     gmem_tiled_copy_Mask,
-        //     tMaskgMask, tMasksMask,
-        //     tMaskcMask, tMaskpMask,
-        //     binfo.actual_seqlen_q - m_block * kBlockM
-        // );
-        // cute::cp_async_fence();
-        // FLASH_NAMESPACE::cp_async_wait<0>();
-        // // Do OR-reduce on the mask to see if any active threads
-
-        FLASH_NAMESPACE::copy_mask_with_or_reduce<Is_even_MN, /*Clear_OOB_MN=*/true, /*To_type=*/Element>(
+        FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
             gmem_tiled_copy_Mask,
             tMaskgMask, tMasksMask,
-            any_active,
             tMaskcMask, tMaskpMask,
             binfo.actual_seqlen_q - m_block * kBlockM
         );
-        // We don't need to syncthreads here because copy_mask is already or_syncthreads.
+        __syncthreads();
+        // Do OR-reduce on the mask to see if any active threads for current iteration.
+        FLASH_NAMESPACE::mask_or_reduce(
+            tMasksMask,
+            any_active,
+            smem_thr_copy_Mask
+        );
     }
 
     // We don't need to clear the sK smem tiles since we'll mask out the scores anyway.
@@ -1144,7 +1133,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
         binfo.actual_seqlen_k - n_block * kBlockN
     );
     if constexpr (Has_bias) {
-        FLASH_NAMESPACE::copy_bias<Is_even_MN, /*Clear_OOB_MN=*/true>(
+        FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
             gmem_tiled_copy_Bias,
             tBiasgBias, tBiassBias,
             tBiascBias, tBiaspBias,
@@ -1305,24 +1294,19 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
         }
 
         if constexpr (Has_mask) {
-            // FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
-            //     gmem_tiled_copy_Mask,
-            //     tMaskgMask, tMasksMask,
-            //     tMaskcMask, tMaskpMask,
-            //     binfo.actual_seqlen_q - m_block * kBlockM
-            // );
-            // cute::cp_async_fence();
-            // FLASH_NAMESPACE::cp_async_wait<0>();
-            // // Do OR-reduce on the mask to see if any active threads for next iteration.
-
-            FLASH_NAMESPACE::copy_mask_with_or_reduce<Is_even_MN, /*Clear_OOB_MN=*/true, /*To_type=*/Element>(
+            FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
                 gmem_tiled_copy_Mask,
                 tMaskgMask, tMasksMask,
-                any_active_next,
                 tMaskcMask, tMaskpMask,
                 binfo.actual_seqlen_q - m_block * kBlockM
             );
-            // We don't need to syncthreads here because copy_mask is already or_syncthreads.
+            __syncthreads();
+            // Do OR-reduce on the mask to see if any active threads for next iteration.
+            FLASH_NAMESPACE::mask_or_reduce(
+                tMasksMask,
+                any_active_next,
+                smem_thr_copy_Mask
+            );
         }
 
         if (any_active_next) {
@@ -1332,9 +1316,9 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
             tKVcKV, tKVpKV
         );
         if constexpr (Has_bias) {
-            FLASH_NAMESPACE::copy_bias<Is_even_MN, /*Clear_OOB_MN=*/true>(
+            FLASH_NAMESPACE::copy_MN<Is_even_MN, /*Clear_OOB_MN=*/true>(
                 gmem_tiled_copy_Bias,
-                tBiasgBias, tBiassBias,
+                tBiasgBias, tBiassBias,
                 tBiascBias, tBiaspBias,
                 binfo.actual_seqlen_q - m_block * kBlockM
             );
@@ -1492,24 +1476,19 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
        }
 
        if constexpr (Has_mask) {
-            // FLASH_NAMESPACE::copy_MN</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
-            //     gmem_tiled_copy_Mask,
-            //     tMaskgMask, tMasksMask,
-            //     tMaskcMask, tMaskpMask,
-            //     binfo.actual_seqlen_q - m_block * kBlockM
-            // );
-            // cute::cp_async_fence();
-            // FLASH_NAMESPACE::cp_async_wait<0>();
-            // // Do OR-reduce on the mask to see if any active threads for next iteration.
-
-            FLASH_NAMESPACE::copy_mask_with_or_reduce</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false, /*To_type=*/Element>(
+            FLASH_NAMESPACE::copy_MN</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
                 gmem_tiled_copy_Mask,
                 tMaskgMask, tMasksMask,
-                any_active_next,
                 tMaskcMask, tMaskpMask,
                 binfo.actual_seqlen_q - m_block * kBlockM
             );
-            // We don't need to syncthreads here because copy_mask is already or_syncthreads.
+            __syncthreads();
+            // Do OR-reduce on the mask to see if any active threads for next iteration.
+            FLASH_NAMESPACE::mask_or_reduce(
+                tMasksMask,
+                any_active_next,
+                smem_thr_copy_Mask
+            );
        }
 
        if (any_active_next) {
@@ -1519,9 +1498,9 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
             tKVcKV, tKVpKV
         );
         if constexpr (Has_bias) {
-            FLASH_NAMESPACE::copy_bias</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
+            FLASH_NAMESPACE::copy_MN</*Is_even_MN=*/true, /*Clear_OOB_MN=*/false>(
                 gmem_tiled_copy_Bias,
-                tBiasgBias, tBiassBias,
+                tBiasgBias, tBiassBias,
                 tBiascBias, tBiaspBias,
                 binfo.actual_seqlen_q - m_block * kBlockM
             );
