Commit 7044f66

kernel: fix register spilling issue for attention head_dim=256 (#397)
1 parent: 29a9b31

File tree

5 files changed (+72, -115 lines)

  src/kernels/attention/mha_kernel_sm80.cuh
  src/kernels/attention/mha_sm80_bench.cu
  src/kernels/attention/mha_sm80_pagedkv_bench.cu
  src/kernels/attention/mha_traits_sm80.h
  src/kernels/attention/online_softmax.cuh

src/kernels/attention/mha_kernel_sm80.cuh
Lines changed: 25 additions & 63 deletions

@@ -299,21 +299,6 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
     return;
   }
 
-  // ############### Prologue ###############
-  int n_block_idx = n_block_max - 1;
-  // produce query: [] => [q]
-  produce_query();
-  cp_async_fence();
-  // produce key: [q] => [q, k]
-  produce_key(n_block_idx);
-  cp_async_fence();
-
-  // ############### Mainloop ###############
-  // attention score accumulator, (MMA,MMA_M,MMA_N)
-  auto tSrAccS = partition_fragment_C(tiled_mma, Shape<_BLK_M, _BLK_N>{});
-  auto tSrAccS_rc_view =
-      make_tensor(tSrAccS.data(), Layout::to_rowcol(tSrAccS.layout()));
-
   auto apply_logits_soft_cap = [&](auto& tSrAccS) {
     if constexpr (SOFT_CAP) {
       CUTE_UNROLL
@@ -323,7 +308,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
     }
   };
 
-  constexpr int kMMA_M = size<1>(tSrAccS);
+  constexpr int kMMA_M = size<1>(tOrAccO);
   using Softmax = OnlineSoftmax<kRowsPerMMA * kMMA_M>;
   using Mask = Mask<kBlockM, kBlockM, kRowsPerMMA, kMMA_M, ALIBI, LOCAL>;
 
@@ -338,12 +323,26 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
             sm_scale,
             params.alibi_slopes_ptr);
 
-  // seperate oob mask iterations for better performance
+  // ############### Prologue ###############
+  // produce query: [] => [q]
+  produce_query();
+  cp_async_fence();
+  // produce key: [q] => [q, k]
+  produce_key(n_block_max - 1);
+  cp_async_fence();
+
+  // ############### Mainloop ###############
   constexpr int n_oob_mask = cute::ceil_div(kBlockM, kBlockN) + 1;
+  const int n_blocks = n_block_max - n_block_min;
 
-  // oob mask iterations
-  CUTE_UNROLL
-  for (int i = 0; i < n_oob_mask; ++i) {
+  CUTE_NO_UNROLL
+  for (int i = 0; i < n_blocks; ++i) {
+    const int n_block_idx = n_block_max - 1 - i;
+
+    // attention score accumulator, (MMA,MMA_M,MMA_N)
+    auto tSrAccS = partition_fragment_C(tiled_mma, Shape<_BLK_M, _BLK_N>{});
+    auto tSrAccS_rc_view =
+        make_tensor(tSrAccS.data(), Layout::to_rowcol(tSrAccS.layout()));
     clear(tSrAccS);
 
     // wait key, queue: [q, k] => []
@@ -361,57 +360,20 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
 
     compute_qk(tSrAccS);
 
-    if constexpr (SOFT_CAP) {
-      apply_logits_soft_cap(tSrAccS);
-    }
-    mask.apply(tSrAccS_rc_view, n_block_idx);
-    softmax.rescale(tSrAccS_rc_view, tOrAccO_rc_view);
-
     // wait value, [v] => []
     cp_async_wait<0>();
     __syncthreads();
 
-    // produce next key: [] => [k]
-    if (n_block_idx > n_block_min) {
-      produce_key_no_oob(n_block_idx - 1);
-    }
-    cp_async_fence();
-
-    // 2> O = softmax(S)*V
-    compute_sv(tSrAccS, tOrAccO);
-
-    --n_block_idx;
-    if (n_block_idx < n_block_min) {
-      // no more kv blocks to process
-      break;
-    }
-  }
-
-  // non-oob mask iterations
-  CUTE_NO_UNROLL
-  for (; n_block_idx >= n_block_min; --n_block_idx) {
-    clear(tSrAccS);
-
-    // wait key, queue: [q, k] => []
-    cp_async_wait<0>();
-    __syncthreads();
-
-    // produce value, [] => [v]
-    produce_value_no_oob(n_block_idx);
-    cp_async_fence();
-
-
-    compute_qk(tSrAccS);
-
     if constexpr (SOFT_CAP) {
       apply_logits_soft_cap(tSrAccS);
     }
-    mask.apply</*OOB_MASK=*/false>(tSrAccS_rc_view, n_block_idx);
-    softmax.rescale(tSrAccS_rc_view, tOrAccO_rc_view);
+    if (i < n_oob_mask) {
+      mask.apply(tSrAccS_rc_view, n_block_idx);
+    } else {
+      mask.apply</*OOB_MASK=*/false>(tSrAccS_rc_view, n_block_idx);
+    }
+    softmax.rescale(tSrAccS_rc_view, tOrAccO_rc_view);
 
     // produce next key: [] => [k]
     if (n_block_idx > n_block_min) {

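Note on the change above: the two specialized loops (an unrolled out-of-bounds-masked prologue followed by a non-unrolled steady-state loop) are folded into a single CUTE_NO_UNROLL loop that declares the score accumulator per iteration and selects the masking mode with `i < n_oob_mask`, which, per the commit message, is presumably what relieves the register pressure at head_dim=256. The short C++ sketch below only illustrates that control-flow shape; compute_scores, apply_mask, and the toy float vectors are hypothetical stand-ins, not the kernel's CuTe fragments or helpers.

    // A minimal, self-contained sketch of the unified mainloop structure (C++17).
    #include <cstdio>
    #include <vector>

    // Toy stand-in for computing one KV block's attention scores.
    std::vector<float> compute_scores(int n_block_idx) {
      return std::vector<float>(8, static_cast<float>(n_block_idx));
    }

    // Toy stand-in for mask.apply</*OOB_MASK=*/...>().
    template <bool OOB_MASK>
    void apply_mask(std::vector<float>& s) {
      if constexpr (OOB_MASK) {
        s.back() = 0.0f;  // boundary block: mask the out-of-range tail (toy version)
      }
    }

    void mainloop(int n_block_min, int n_block_max, int n_oob_mask) {
      std::vector<float> o(8, 0.0f);  // output accumulator, lives across all blocks
      const int n_blocks = n_block_max - n_block_min;
      for (int i = 0; i < n_blocks; ++i) {  // CUTE_NO_UNROLL in the kernel
        const int n_block_idx = n_block_max - 1 - i;
        // per-iteration score accumulator, like tSrAccS inside the new loop
        std::vector<float> s = compute_scores(n_block_idx);
        if (i < n_oob_mask) {
          apply_mask</*OOB_MASK=*/true>(s);   // only the first few (boundary) blocks
        } else {
          apply_mask</*OOB_MASK=*/false>(s);  // steady state: no boundary masking
        }
        for (size_t j = 0; j < o.size(); ++j) o[j] += s[j];  // stands in for compute_sv
      }
      std::printf("o[0] = %f\n", o[0]);
    }

    int main() {
      mainloop(/*n_block_min=*/0, /*n_block_max=*/8, /*n_oob_mask=*/2);
      return 0;
    }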
src/kernels/attention/mha_sm80_bench.cu
Lines changed: 2 additions & 14 deletions

@@ -7,22 +7,10 @@
 #include "mha_dispatch_sm80.cuh"
 #include "mha_kernel_sm80.cuh"  // IWYU pragma: keep
 #include "mha_params.h"
+#include "static_dispatch.h"
 
 using namespace llm;
 
-#define DISPATCH_HEAD_DIM_(HEAD_DIM_V, HEAD_DIM_NAME, ...) \
-  [&] {                                                    \
-    if (HEAD_DIM_V <= 64) {                                \
-      constexpr static int HEAD_DIM_NAME = 64;             \
-      return __VA_ARGS__();                                \
-    } else if (HEAD_DIM_V <= 128) {                        \
-      constexpr static int HEAD_DIM_NAME = 128;            \
-      return __VA_ARGS__();                                \
-    } else {                                               \
-      assert(false);                                       \
-    }                                                      \
-  }()
-
 void mha_bench_sm80(nvbench::state& state) {
   // Collect CUPTI metrics
   state.collect_cupti_metrics();
@@ -82,7 +70,7 @@ void mha_bench_sm80(nvbench::state& state) {
   params.sliding_window = sliding_window;
 
   state.exec([&](nvbench::launch& launch) {
-    DISPATCH_HEAD_DIM_(head_dim, HEAD_DIM, [&] {
+    DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, [&] {
       run_mha_kernel_sm80<cute::half_t, HEAD_DIM>(params, launch.get_stream());
     });
   });

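Both benchmark files now include static_dispatch.h and call the shared DISPATCH_HEAD_DIM instead of the local DISPATCH_HEAD_DIM_ macro, which only handled head_dim <= 64 and <= 128. The real macro in static_dispatch.h is not shown in this diff; as an assumption for illustration only, a shared head-dim dispatcher of this kind would look roughly like the sketch below, with an extra branch so head_dim=256 can reach the kernel.

    // Illustrative only -- the actual DISPATCH_HEAD_DIM in static_dispatch.h may differ.
    #include <cassert>

    #define DISPATCH_HEAD_DIM_EXAMPLE(HEAD_DIM_V, HEAD_DIM_NAME, ...) \
      [&] {                                                           \
        if (HEAD_DIM_V <= 64) {                                       \
          constexpr static int HEAD_DIM_NAME = 64;                    \
          return __VA_ARGS__();                                       \
        } else if (HEAD_DIM_V <= 128) {                               \
          constexpr static int HEAD_DIM_NAME = 128;                   \
          return __VA_ARGS__();                                       \
        } else if (HEAD_DIM_V <= 256) {                               \
          constexpr static int HEAD_DIM_NAME = 256;                   \
          return __VA_ARGS__();                                       \
        } else {                                                      \
          assert(false);                                              \
        }                                                             \
      }()

    // Usage mirrors the benchmark call sites:
    //   DISPATCH_HEAD_DIM_EXAMPLE(head_dim, HEAD_DIM, [&] {
    //     run_mha_kernel_sm80<cute::half_t, HEAD_DIM>(params, launch.get_stream());
    //   });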
src/kernels/attention/mha_sm80_pagedkv_bench.cu
Lines changed: 2 additions & 14 deletions

@@ -8,22 +8,10 @@
 #include "mha_dispatch_sm80.cuh"
 #include "mha_kernel_sm80.cuh"  // IWYU pragma: keep
 #include "mha_params.h"
+#include "static_dispatch.h"
 
 using namespace llm;
 
-#define DISPATCH_HEAD_DIM_(HEAD_DIM_V, HEAD_DIM_NAME, ...) \
-  [&] {                                                    \
-    if (HEAD_DIM_V <= 64) {                                \
-      constexpr static int HEAD_DIM_NAME = 64;             \
-      return __VA_ARGS__();                                \
-    } else if (HEAD_DIM_V <= 128) {                        \
-      constexpr static int HEAD_DIM_NAME = 128;            \
-      return __VA_ARGS__();                                \
-    } else {                                               \
-      assert(false);                                       \
-    }                                                      \
-  }()
-
 void mha_bench_sm80(nvbench::state& state) {
   // Collect CUPTI metrics
   state.collect_cupti_metrics();
@@ -130,7 +118,7 @@ void mha_bench_sm80(nvbench::state& state) {
   params.block_cu_lens = block_cu_lens.const_data_ptr<int32_t>();
 
   state.exec([&](nvbench::launch& launch) {
-    DISPATCH_HEAD_DIM_(head_dim, HEAD_DIM, [&] {
+    DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, [&] {
      run_mha_kernel_sm80<cute::half_t, HEAD_DIM>(params, launch.get_stream());
    });
  });

src/kernels/attention/mha_traits_sm80.h
Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ struct MHATraitsSM80 {
   // Tiled copy for QKV
   // g2s tiled copy for q
   using GmemTiledCopyQ = decltype(make_tiled_copy(
-      Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<cute::uint128_t>, DType>{},
+      Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, DType>{},
      GmemCopyThrLayout{},     // Thr layout: (_16,_8)/(_32, _4)
      Layout<Shape<_1, _8>>{}  // Val layout: 8 vals per read
      ));

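For context on the one-line change in mha_traits_sm80.h: at the PTX level, cute's SM80_CP_ASYNC_CACHEGLOBAL issues a plain 16-byte cp.async.cg copy, while the _ZFILL flavor passes an extra src-size operand so out-of-bounds reads can be zero-filled by the copy itself. The wrappers below are a rough sketch of that difference (my names, not the cute atoms), assuming compilation for sm_80 or newer.

    #include <cstdint>

    // Roughly what the plain cache-global atom does: always copy 16 bytes g2s.
    __device__ __forceinline__ void cp_async_cg_16(void* smem_dst, const void* gmem_src) {
      const uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_dst));
      asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n" ::"r"(smem),
                   "l"(gmem_src));
    }

    // Roughly what the ZFILL atom does: copy src_bytes (0..16) and zero-fill the
    // rest of the 16-byte destination, which is how out-of-bounds tails get handled.
    __device__ __forceinline__ void cp_async_cg_zfill_16(void* smem_dst,
                                                         const void* gmem_src,
                                                         int src_bytes) {
      const uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_dst));
      asm volatile("cp.async.cg.shared.global [%0], [%1], 16, %2;\n" ::"r"(smem),
                   "l"(gmem_src), "r"(src_bytes));
    }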
src/kernels/attention/online_softmax.cuh
Lines changed: 42 additions & 23 deletions

@@ -52,53 +52,72 @@ struct OnlineSoftmax {
 
   // computes the softmax scores and rescales the output
   // - score = exp(score - row_max`)
-  // - O = O * s_scale
+  // - o = o * s_scale
   // - internal: row_sum = row_sum * s_scale + row_sum`
   template <typename FragmentS, typename FragmentO>
   CUTE_DEVICE void rescale(FragmentS& rAccS, FragmentO& rAccO) {
+    // row_max = max(row_max, scores)
+    FragmentT pre_row_max;
+    cute::copy(row_max_, pre_row_max);
     CUTE_UNROLL
     for (int si = 0; si < size<0>(rAccS); ++si) {
-      // rowmax across 4 threads
-      float cur_rowmax = row_max_(si);
+      float row_max = row_max_(si);
+      // rowmax within a thread
      CUTE_UNROLL
      for (int sj = 0; sj < size<1>(rAccS); ++sj) {
-        cur_rowmax = max(cur_rowmax, rAccS(si, sj));
+        row_max = max(row_max, rAccS(si, sj));
      }
-      cur_rowmax = detail::group_reduce_max<4>(cur_rowmax);
+      // rowmax across 4 threads
+      row_max_(si) = detail::group_reduce_max<4>(row_max);
+    }
 
-      // scores = exp(scores - row_max)
-      const float rowmax_scale = cur_rowmax * sm_scale_;
-      float cur_rowsum = 0;
+    // o = o * s_scale
+    CUTE_UNROLL
+    for (int si = 0; si < size<0>(rAccO); ++si) {
+      const float s_scale =
+          ptx::exp2((pre_row_max(si) - row_max_(si)) * sm_scale_);
+      CUTE_UNROLL
+      for (int sj = 0; sj < size<1>(rAccO); ++sj) {
+        rAccO(si, sj) *= s_scale;
+      }
+    }
+
+    // scores = exp(scores - row_max)
+    CUTE_UNROLL
+    for (int si = 0; si < size<0>(rAccS); ++si) {
+      const float rowmax_scale = row_max_(si) * sm_scale_;
      CUTE_UNROLL
      for (int sj = 0; sj < size<1>(rAccS); sj++) {
        rAccS(si, sj) = ptx::exp2(rAccS(si, sj) * sm_scale_ - rowmax_scale);
-        cur_rowsum += rAccS(si, sj);
      }
+    }
 
-      // scores_scale = exp(max - cur_rowmax)
-      const float scores_scale =
-          ptx::exp2(row_max_(si) * sm_scale_ - rowmax_scale);
-      // o_2 = o_1 * s_scale
+    // row_sum = row_sum * s_scale + row_sum`
+    CUTE_UNROLL
+    for (int si = 0; si < size<0>(rAccS); ++si) {
+      const float s_scale =
+          ptx::exp2((pre_row_max(si) - row_max_(si)) * sm_scale_);
+      row_sum_(si) *= s_scale;
      CUTE_UNROLL
-      for (int sj = 0; sj < size<1>(rAccO); ++sj) {
-        rAccO(si, sj) *= scores_scale;
+      for (int sj = 0; sj < size<1>(rAccS); sj++) {
+        // rowsum within a thread
+        row_sum_(si) += rAccS(si, sj);
      }
-
-      // update row_max and row_sum
-      row_max_(si) = cur_rowmax;
-      // s_2 = s_1 * s_scale + row_sum
-      row_sum_(si) = row_sum_(si) * scores_scale + cur_rowsum;
    }
  }
 
-  // finalizes the softmax computation with O = O / row_sum
+  // finalizes the softmax computation with o = o / row_sum
  template <typename FragmentO>
  CUTE_DEVICE void finalize(FragmentO& rAccO) {
    CUTE_UNROLL
-    for (int oi = 0; oi < size<0>(rAccO); ++oi) {
+    for (int i = 0; i < size(row_sum_); ++i) {
      // rowsum across 4 threads
-      row_sum_(oi) = detail::group_reduce_sum<4>(row_sum_(oi));
+      row_sum_(i) = detail::group_reduce_sum<4>(row_sum_(i));
+    }
 
+    // o = o / row_sum
+    CUTE_UNROLL
+    for (int oi = 0; oi < size<0>(rAccO); ++oi) {
      CUTE_UNROLL
      for (int oj = 0; oj < size<1>(rAccO); ++oj) {
        rAccO(oi, oj) *= ptx::rcp(row_sum_(oi));

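The rescale rewrite in online_softmax.cuh splits the previously fused per-row loop into separate passes (update row_max, rescale o and row_sum by s_scale, exponentiate the scores, accumulate row_sum) and snapshots the old maxima in pre_row_max. The scalar host-side sketch below runs the same online-softmax recurrence on a single row so the algebra is easy to check against a plain softmax; it deliberately ignores the cross-thread group reductions and the base-2/sm_scale folding used in the kernel.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Online softmax over one row, processed in chunks (as the kernel processes
    // KV blocks). State mirrors OnlineSoftmax: running row_max, row_sum, and o.
    struct OnlineSoftmaxRow {
      float row_max = -INFINITY;
      float row_sum = 0.0f;
      float o = 0.0f;  // toy "output": softmax-weighted sum of values

      void rescale(const std::vector<float>& scores, const std::vector<float>& values) {
        const float pre_row_max = row_max;
        // 1) row_max = max(row_max, scores)
        for (float s : scores) row_max = std::max(row_max, s);
        // 2) rescale previous state by s_scale = exp(pre_row_max - row_max)
        const float s_scale = std::exp(pre_row_max - row_max);
        o *= s_scale;
        row_sum *= s_scale;
        // 3) p = exp(score - row_max); accumulate into row_sum and o
        for (size_t i = 0; i < scores.size(); ++i) {
          const float p = std::exp(scores[i] - row_max);
          row_sum += p;
          o += p * values[i];
        }
      }

      // finalize: o = o / row_sum
      float finalize() const { return o / row_sum; }
    };

    int main() {
      OnlineSoftmaxRow osm;
      osm.rescale({0.1f, 2.0f}, {1.0f, 2.0f});   // first KV block
      osm.rescale({3.0f, -1.0f}, {3.0f, 4.0f});  // second KV block
      std::printf("online  : %f\n", osm.finalize());

      // Reference: plain softmax over the concatenated row.
      std::vector<float> s = {0.1f, 2.0f, 3.0f, -1.0f}, v = {1.0f, 2.0f, 3.0f, 4.0f};
      float m = -INFINITY, z = 0.0f, out = 0.0f;
      for (float x : s) m = std::max(m, x);
      for (size_t i = 0; i < s.size(); ++i) {
        z += std::exp(s[i] - m);
        out += std::exp(s[i] - m) * v[i];
      }
      std::printf("baseline: %f\n", out / z);
      return 0;
    }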