Refactors copy function into specialized variants

algo-home · algo-home · commit a148a3a9c290 · 2025-09-17T12:52:01.000+08:00
Splits the generic copy_MN function into four specialized functions:
- copy_MN for basic tensor copying with tiled copy operations
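- copy_mask for mask copying; it currently uses the same tiled-copy body as copy_MN
- copy_mask_with_or_reduce for element-wise copying with a cast to To_type, plus a block-wide OR reduction that reports through block_active whether any copied element is set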
- copy_bias for bias-specific copying with element-wise assignment

Removes the Bool_to_Element template parameter and its `if constexpr` branch,
simplifying the codebase by creating purpose-specific functions instead of
a single function template whose behavior was switched by template flags.
diff --git a/csrc/flash_dmattn/src/utils.h b/csrc/flash_dmattn/src/utils.h
@@ -540,13 +540,132 @@ __forceinline__ __device__ void copy(
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template <
-    bool Is_even_MN=true, bool Clear_OOB_MN=false, bool Bool_to_Element=false, typename To_type=void,
-    // typename TiledCopy, 
+    bool Is_even_MN=true, bool Clear_OOB_MN=false,      // Is_even_MN: bypass both bounds predicates; Clear_OOB_MN: cute::clear the slices of D that are skipped
+    typename TiledCopy,
     typename Engine0, typename Layout0, typename Engine1, typename Layout1,
     typename Engine2, typename Layout2, typename Engine3, typename Layout3
 >
 __forceinline__ __device__ void copy_MN(
-    // TiledCopy tiled_copy,
+    TiledCopy tiled_copy,                               // forwarded to cute::copy for every (m, n) slice that passes the predicates
+    Tensor<Engine0, Layout0> const &S, Tensor<Engine1, Layout1> &D,     // source and destination; both rank 3 with equal mode sizes (asserted below)
+    Tensor<Engine2, Layout2> const &identity_MN,  Tensor<Engine3, Layout3> const &predicate_N,     // identity_MN(0, m, 0) supplies the value tested against max_M; predicate_N(n) gates index n
+    const int max_M=0                                   // exclusive upper bound for get<0>(identity_MN(0, m, 0)); only consulted when !Is_even_MN
+) {     // Copies S into D one (m, n) slice at a time through tiled_copy, skipping (and optionally clearing) slices rejected by the predicates.
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});          // (MMA, MMA_M, MMA_N) 
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});          // (MMA, MMA_M, MMA_N)
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));     // MMA_N
+    
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_M) {     // m is in range (presumably a row-coordinate check against max_M — confirm with callers)
+            #pragma unroll
+            for (int n = 0; n < size<2>(S); ++n) {
+                if (Is_even_MN || predicate_N(n)) {
+                    cute::copy(tiled_copy, S(_, m, n), D(_, m, n));
+                } else if (Clear_OOB_MN) {
+                    cute::clear(D(_, m, n));     // predicate_N(n) is false: clear only this (m, n) slice
+                }
+            }
+        } else if (Clear_OOB_MN) {
+            cute::clear(D(_, m, _));     // m fails the max_M test: clear every n for this m
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <     // copy_mask: copies S into D one (m, n) slice at a time through tiled_copy, with optional bounds predicates. NOTE(review): body is currently identical to copy_MN.
+    bool Is_even_MN=true, bool Clear_OOB_MN=false,      // Is_even_MN: bypass both bounds predicates; Clear_OOB_MN: cute::clear the slices of D that are skipped
+    typename TiledCopy,
+    typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+    typename Engine2, typename Layout2, typename Engine3, typename Layout3
+>
+__forceinline__ __device__ void copy_mask(
+    TiledCopy tiled_copy,                               // forwarded to cute::copy for every (m, n) slice that passes the predicates
+    Tensor<Engine0, Layout0> const &S, Tensor<Engine1, Layout1> &D,     // source and destination; both rank 3 with equal mode sizes (asserted below)
+    Tensor<Engine2, Layout2> const &identity_MN,  Tensor<Engine3, Layout3> const &predicate_N,     // identity_MN(0, m, 0) supplies the value tested against max_M; predicate_N(n) gates index n
+    const int max_M=0                                   // exclusive upper bound for get<0>(identity_MN(0, m, 0)); only consulted when !Is_even_MN
+) {
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});          // (MMA, MMA_M, MMA_N) 
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});          // (MMA, MMA_M, MMA_N)
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));     // MMA_N
+
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_M) {     // m is in range (presumably a row-coordinate check against max_M — confirm with callers)
+            #pragma unroll
+            for (int n = 0; n < size<2>(S); ++n) {
+                if (Is_even_MN || predicate_N(n)) {
+                    cute::copy(tiled_copy, S(_, m, n), D(_, m, n));
+                } else if (Clear_OOB_MN) {
+                    cute::clear(D(_, m, n));     // predicate_N(n) is false: clear only this (m, n) slice
+                }
+            }
+        } else if (Clear_OOB_MN) {
+            cute::clear(D(_, m, _));     // m fails the max_M test: clear every n for this m
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <     // copy_mask_with_or_reduce: element-wise copy of S into D with a cast to To_type, plus a block-wide OR of the copied source values.
+    bool Is_even_MN=true, bool Clear_OOB_MN=false, typename To_type=void,     // NOTE(review): the void default is not assignable at the static_cast below — callers presumably always pass To_type explicitly; confirm
+    typename TiledCopy,
+    typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+    typename Engine2, typename Layout2, typename Engine3, typename Layout3
+>
+__forceinline__ __device__ void copy_mask_with_or_reduce(
+    TiledCopy tiled_copy,                               // not referenced in this body; the copy is done element by element
+    Tensor<Engine0, Layout0> const &S, Tensor<Engine1, Layout1> &D,     // source and destination; both rank 3 with equal mode sizes (asserted below)
+    bool &block_active,                                 // output: overwritten with the __syncthreads_or result at the end
+    Tensor<Engine2, Layout2> const &identity_MN,  Tensor<Engine3, Layout3> const &predicate_N,     // identity_MN(0, m, 0) supplies the value tested against max_M; predicate_N(n) gates index n
+    const int max_M=0                                   // exclusive upper bound for get<0>(identity_MN(0, m, 0)); only consulted when !Is_even_MN
+) {
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});          // (MMA, MMA_M, MMA_N) 
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});          // (MMA, MMA_M, MMA_N)
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));     // MMA_N
+
+    bool any_active = false;     // this thread's OR over the source elements it actually copies
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_M) {     // m is in range (presumably a row-coordinate check against max_M — confirm with callers)
+            #pragma unroll
+            for (int n = 0; n < size<2>(S); ++n) {
+                if (Is_even_MN || predicate_N(n)) {
+                    #pragma unroll
+                    for (int i = 0; i < size<0>(S); ++i) {
+                        any_active |= S(i, m, n);     // elements skipped by the predicates never contribute to the reduction
+                        D(i, m, n) = static_cast<To_type>(S(i, m, n));
+                    }
+                } else if (Clear_OOB_MN) {
+                    cute::clear(D(_, m, n));     // predicate_N(n) is false: clear only this (m, n) slice
+                }
+            }
+        } else if (Clear_OOB_MN) {
+            cute::clear(D(_, m, _));     // m fails the max_M test: clear every n for this m
+        }
+    }
+
+    block_active = __syncthreads_or(any_active);     // block-wide barrier + OR of any_active: every thread of the block must reach this call
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool Is_even_MN=true, bool Clear_OOB_MN=false,
+    typename TiledCopy,
+    typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+    typename Engine2, typename Layout2, typename Engine3, typename Layout3
+>
+__forceinline__ __device__ void copy_bias(
+    TiledCopy tiled_copy,
     Tensor<Engine0, Layout0> const &S, Tensor<Engine1, Layout1> &D,
     Tensor<Engine2, Layout2> const &identity_MN,  Tensor<Engine3, Layout3> const &predicate_N,
     const int max_M=0
@@ -563,18 +682,10 @@ __forceinline__ __device__ void copy_MN(
             #pragma unroll
             for (int n = 0; n < size<2>(S); ++n) {
                 if (Is_even_MN || predicate_N(n)) {
-                    if constexpr (Bool_to_Element) {
-                        #pragma unroll
-                        for (int i = 0; i < size<0>(S); ++i) {
-                            D(i, m, n) = static_cast<bool>(S(i, m, n)) ? To_type(1) : To_type(0);
-                        }   
-                    } else {
-                        // Using vectorized load will cause out-of-bounds access when !Is_even_MN && !predicate_N(n)
-                        // cute::copy(tiled_copy, S(_, m, n), D(_, m, n));
-                        #pragma unroll
-                        for (int i = 0; i < size<0>(S); ++i) {
-                            D(i, m, n) = S(i, m, n);
-                        }
+                    // cute::copy(tiled_copy, S(_, m, n), D(_, m, n));
+                    #pragma unroll
+                    for (int i = 0; i < size<0>(S); ++i) {
+                        D(i, m, n) = S(i, m, n);
                     }
                 } else if (Clear_OOB_MN) {
                     cute::clear(D(_, m, n));