Commit 510aaf5

Adds mask reduction utility function
Implements a device function that performs a logical OR reduction across the elements of a mask tensor fragment and broadcasts the result to all threads in the thread block via the `__syncthreads_or` primitive. Enables efficient sparse attention pattern processing by letting the threads of a block collectively determine whether any mask elements are active within a given region.
1 parent e23b08f commit 510aaf5

File tree

1 file changed: +19 −0 lines changed


csrc/flash_dmattn/src/utils.h

Lines changed: 19 additions & 0 deletions
@@ -333,6 +333,25 @@ __forceinline__ __device__ void sparse_gemm_rs(
     }
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template <typename Tensor, typename ThrCopy>
+__forceinline__ __device__ void mask_or_reduce(
+    Tensor &tSsMask,
+    bool &active,
+    ThrCopy smem_thr_copy_Mask
+) {
+    Tensor tSsMask_copy_view = smem_thr_copy_Mask.retile_D(tSsMask);
+    bool active_local = false;
+    #pragma unroll
+    for (int i = 0; i < size(tSsMask_copy_view); ++i) {
+        active_local |= tSsMask_copy_view(i);
+    }
+    active = __syncthreads_or(active_local);
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 // Convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
