kernel: use cp_async_zfill instead of cute::clear for oob handling

guocuimi · guocuimi · commit 951225f6047b · 2025-01-23T15:59:35.000-08:00
diff --git a/src/kernels/attention/attention_kernel_sm80.cuh b/src/kernels/attention/attention_kernel_sm80.cuh
@@ -143,10 +143,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
   auto produce_q = [&]() {
     auto tQgQ = gmem_thr_copy_Q.partition_S(gQ);
     auto tQsQ = gmem_thr_copy_Q.partition_D(sQ);
-    safe_copy<EVEN_K,
-              /*EVEN_MN=*/false,
-              /*ZERO_FILL_MN=*/true,
-              /*ZERO_FILL_K=*/true>(
+    safe_copy</*EVEN_MN=*/false, EVEN_K>(
         gmem_tiled_copy_Q,
         tQgQ,
         tQsQ,
@@ -159,10 +156,9 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
   auto produce_k = [&](int ni) {
     auto tKgK = gmem_thr_copy_KV.partition_S(gK(_, _, ni));
     // skip zfill_mn for k since mask will mask out oob with -inf
-    safe_copy<EVEN_K,
-              /*EVEN_MN=*/false,
-              /*ZERO_FILL_MN=*/false,
-              /*ZERO_FILL_K=*/true>(
+    safe_copy</*EVEN_MN=*/false,
+              EVEN_K,
+              /*ZERO_FILL_MN=*/false>(
         gmem_tiled_copy_KV,
         tKgK,
         tKsK,
@@ -174,10 +170,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
   auto produce_v = [&](int ni) {
     auto tVgV = gmem_thr_copy_KV.partition_S(gV(_, _, ni));
     // skipping ZFILL_MN for v may cause nan issue
-    safe_copy<EVEN_K,
-              /*EVEN_MN=*/false,
-              /*ZERO_FILL_MN=*/true,
-              /*ZERO_FILL_K=*/true>(
+    safe_copy</*EVEN_MN=*/false, EVEN_K>(
         gmem_tiled_copy_KV,
         tVgV,
         tVsV,
@@ -302,8 +295,8 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
 
     // wait for smem copy done before gmem copy
     __syncthreads();
-    safe_copy<EVEN_K,
-              /*EVEN_MN=*/false,
+    safe_copy</*EVEN_MN=*/false,
+              EVEN_K,
               /*ZERO_FILL_MN=*/false,
               /*ZERO_FILL_K=*/false>(
         gmem_tiled_copy_O,
diff --git a/src/kernels/attention/attention_traits_sm80.h b/src/kernels/attention/attention_traits_sm80.h
@@ -97,15 +97,15 @@ struct AttentionTraitsSM80 {
   // Tiled copy for QKV
   // g2s tiled copy for q
   using GmemTiledCopyQ = decltype(make_tiled_copy(
-      Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, DType>{},
+      Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<cute::uint128_t>, DType>{},
       GmemCopyThrLayout{},     // Thr layout: (_16,_8)/(_32, _4)
       Layout<Shape<_1, _8>>{}  // Val layout: 8 vals per read
       ));
 
   // g2s tiled copy for kv
   // TODO: choose based on BLK_K and kv cache type
   using GmemTiledCopyKV = decltype(make_tiled_copy(
-      Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, KV_DType>{},
+      Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<cute::uint128_t>, KV_DType>{},
       GmemCopyThrLayout{},     // Thr layout: (_16,_8)/(_32, _4)
       Layout<Shape<_1, _8>>{}  // Val layout: 8 vals per read
       ));
diff --git a/src/kernels/attention/cute_extensions.cuh b/src/kernels/attention/cute_extensions.cuh
@@ -22,22 +22,62 @@ CUTE_HOST_DEVICE constexpr auto elem_less(IntTupleA const& a,
   return elem_less(get<I>(a), get<I>(b));
 }
 
-template <bool EVEN_K,
-          bool EVEN_MN,
-          bool ZERO_FILL_MN,
-          bool ZERO_FILL_K,
-          class TiledCopy,
+// Zero-fills `dst` by going through `copy_atom` rather than a plain store.
+//
+// If the atom's Traits provide a `with(bool)` method, the copy is issued with
+// the predicate set to false, one call per slice of the trailing modes.
+// NOTE(review): presumably a false predicate makes a ZFILL cp.async atom write
+// zeros without reading `src` -- confirm against the atom in use.
+// Atoms without `with(bool)` fall back to `clear(dst)`.
+//
+// copy_atom: atom used to issue the predicated copy.
+// src:       source tensor, same rank as `dst`; only handed to the copy call.
+// dst:       destination tensor to be zeroed.
+template <class Copy_Atom, class TensorS, class TensorD>
+CUTE_HOST_DEVICE void zfill(const Copy_Atom& copy_atom,
+                            const TensorS& src,
+                            TensorD& dst) {
+  CUTE_STATIC_ASSERT(TensorS::rank == TensorD::rank, "rank-mismatch.");
+
+  // Compile-time probe: does Copy_Atom::Traits expose `.with(bool)`?
+  auto has_with_bool = cute::is_valid(
+      [](auto t) -> void_t<decltype(declval<typename decltype(t)::Traits>()
+                                        .with(true))> {},
+      copy_atom);
+  if constexpr (has_with_bool) {
+    constexpr int R = TensorD::rank;
+    if constexpr (R == 1) {  // Dispatch the copy
+      copy_atom.with(false).call(src, dst);
+    } else {  // Loop over all but the first mode
+      Tensor src_v = group_modes<1, R>(src);
+      Tensor dst_v = group_modes<1, R>(dst);
+      CUTE_UNROLL
+      for (int i = 0; i < size<1>(dst_v); ++i) {
+        copy_atom.with(false).call(src_v(_, i), dst_v(_, i));
+      }
+    }
+  } else {
+    // just call clear if no with method
+    clear(dst);
+  }
+}
+
+// Overload accepting temporaries (e.g. a `dst(_, mi, ki)` slice).
+// Inside the body `dst` is a named lvalue, so the call below resolves to the
+// `TensorD&` overload above (lvalue-reference parameters win partial ordering
+// against forwarding references) instead of recursing into this overload.
+template <class Copy_Atom, class TensorS, class TensorD>
+CUTE_HOST_DEVICE void zfill(const Copy_Atom& copy_atom,
+                            const TensorS& src,
+                            TensorD&& dst) {
+  zfill(copy_atom, src, dst);
+}
+
+template <bool EVEN_MN,
+          bool EVEN_K,
+          bool ZFILL_MN = true,
+          bool ZFILL_K = true,
+          class CopyAtom,
+          class TV,
+          class Tiler,
           class TensorS,
           class TensorD,
           class TensorC,
           class Coord>
 CUTE_HOST_DEVICE void safe_copy(
-    const TiledCopy& tiled_copy,
+    const TiledCopy<CopyAtom, TV, Tiler>& tiled_copy,
     const TensorS& src,       // (CPY, CPY_M/N, CPY_K)
     TensorD& dst,             // (CPY, CPY_M/N, CPY_K)
     const TensorC& identity,  // (CPY, CPY_M/N, CPY_K) -> (blk_m/n, blk_k)
     const Coord& max_coord    // max_coord(blk_m/n, blk_k)
 ) {
+  CUTE_STATIC_ASSERT(TensorS::rank == TensorD::rank, "rank-mismatch.");
+  auto copy_atom = static_cast<const CopyAtom&>(tiled_copy);
+
   if constexpr (!EVEN_MN && !EVEN_K) {
     // handle both m/n and k oob
     CUTE_UNROLL
@@ -46,16 +86,16 @@ CUTE_HOST_DEVICE void safe_copy(
         CUTE_UNROLL
         for (int ki = 0; ki < size<2>(src); ++ki) {
           if (elem_less<1>(identity(_0{}, _0{}, ki), max_coord)) {
-            copy(tiled_copy, src(_, mi, ki), dst(_, mi, ki));
+            copy(copy_atom, src(_, mi, ki), dst(_, mi, ki));
           } else {
-            if constexpr (ZERO_FILL_K) {
-              clear(dst(_, mi, ki));
+            if constexpr (ZFILL_K) {
+              zfill(copy_atom, src(_, mi, ki), dst(_, mi, ki));
             }
           }
         }
       } else {
-        if constexpr (ZERO_FILL_MN) {
-          clear(dst(_, mi, _));
+        if constexpr (ZFILL_MN) {
+          zfill(copy_atom, src(_, mi, _), dst(_, mi, _));
         }
       }
     }
@@ -64,10 +104,10 @@ CUTE_HOST_DEVICE void safe_copy(
     CUTE_UNROLL
     for (int mi = 0; mi < size<1>(src); ++mi) {
       if (elem_less<0>(identity(_0{}, mi, _0{}), max_coord)) {
-        copy(tiled_copy, src(_, mi, _), dst(_, mi, _));
+        copy(copy_atom, src(_, mi, _), dst(_, mi, _));
       } else {
-        if constexpr (ZERO_FILL_MN) {
-          clear(dst(_, mi, _));
+        if constexpr (ZFILL_MN) {
+          zfill(copy_atom, src(_, mi, _), dst(_, mi, _));
         }
       }
     }
@@ -76,16 +116,16 @@ CUTE_HOST_DEVICE void safe_copy(
     CUTE_UNROLL
     for (int ki = 0; ki < size<2>(src); ++ki) {
       if (elem_less<1>(identity(_0{}, _0{}, ki), max_coord)) {
-        copy(tiled_copy, src(_, _, ki), dst(_, _, ki));
+        copy(copy_atom, src(_, _, ki), dst(_, _, ki));
       } else {
-        if constexpr (ZERO_FILL_K) {
-          clear(dst(_, _, ki));
+        if constexpr (ZFILL_K) {
+          zfill(copy_atom, src(_, _, ki), dst(_, _, ki));
         }
       }
     }
   } else {
     // no oob, just copy
-    copy(tiled_copy, src, dst);
+    copy(copy_atom, src, dst);
   }
 }
 
diff --git a/src/kernels/attention/tools/attention_traits_viewer.cpp b/src/kernels/attention/tools/attention_traits_viewer.cpp
@@ -139,7 +139,7 @@ void test_attn_traits() {
   auto thr_mma = tiled_mma.get_slice(0);
   // (MMA, MMA_N, MMA_K)
   // ((_2,_2),_8,_4):((_1,_2),_16,_4)
-  auto tSrK = partition_fragment_B(thr_mma, sK);
+  auto tSrK = thr_mma.partition_fragment_B(sK);
   print(tSrK);print("\n");
 
   auto tSrK_fp8 = make_fragment_like<cute::int8_t>(tSrK);