
Commit 6c593d7

kernel: added query packing support for attention (#392)

1 parent 8b1d6cc

10 files changed: +327 additions, -182 deletions

src/kernels/attention/attention_kernel_sm80.cuh
Lines changed: 44 additions & 34 deletions

@@ -56,44 +56,34 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
 
   const int m_block = blockIdx.x;
   const int batch_idx = blockIdx.y;
-  const int head_idx = blockIdx.z;
+  const int kv_head_idx = blockIdx.z;
   const int tidx = threadIdx.x;
 
   AttentionTile<Params> tile(params);
 
-  const int group_size = params.n_heads / params.n_kv_heads;
+  // preprocess input parameters
+  const int head_dim = params.head_dim;
+  const int group_size = params.group_size;
+  const float logits_soft_cap = params.logits_soft_cap;
+  const float sm_scale = params.sm_scale;
+  const float sm_scale_log2 = params.sm_scale_log2;
+
   // ProblemShape
-  // (q_len, HEAD_DIM)
-  auto [Q, O] = tile.template get_qo_tile<DType>(batch_idx, head_idx);
+  // (q_packed_len, HEAD_DIM)
+  auto [Q, O] = tile.template get_qo_tile<DType>(batch_idx, kv_head_idx);
   // (kv_len, HEAD_DIM)
-  auto [K, V] =
-      tile.template get_kv_tile<DType>(batch_idx, head_idx / group_size);
+  auto [K, V] = tile.template get_kv_tile<DType>(batch_idx, kv_head_idx);
 
-  const int q_len = size<0>(Q);
+  const int q_packed_len = size<0>(Q);
+  const int q_len = q_packed_len / group_size;
   const int kv_len = size<0>(K);
 
-  if (m_block * kBlockM >= q_len) {
+  if (m_block * kBlockM >= q_packed_len) {
     // m out of bound, return
     return;
   }
 
-  const int head_dim = params.head_dim;
   const int sliding_window = LOCAL ? params.sliding_window : kv_len;
-  const float logits_soft_cap = params.logits_soft_cap;
-  const float sm_scale = params.sm_scale;
-  const float sm_scale_log2 = params.sm_scale_log2;
-  const float alibi_slope =
-      ALIBI ? (params.alibi_slopes_ptr[head_idx] / sm_scale) : 0.0f;
-
-  // preprocess input parameters
-  auto apply_logits_soft_cap = [&](auto& tSrAccS) {
-    if constexpr (SOFT_CAP) {
-      CUTE_UNROLL
-      for (int i = 0; i < size(tSrAccS); ++i) {
-        tSrAccS(i) = ptx::tanh(tSrAccS(i) * logits_soft_cap);
-      }
-    }
-  };
 
   // Gmem
   // (BLK_M, HEAD_DIM)

@@ -136,7 +126,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
   auto produce_query = [&]() {
     auto tQgQ = gmem_thr_copy_Q.partition_S(gQ);
     auto tQsQ = gmem_thr_copy_Q.partition_D(sQ);
-    auto max_coord = make_coord(q_len - m_block * kBlockM, head_dim);
+    auto max_coord = make_coord(q_packed_len - m_block * kBlockM, head_dim);
     safe_copy</*EVEN_MN=*/false, EVEN_K, /*ZFILL_MN=*/true, /*ZFILL_K=*/true>(
         gmem_tiled_copy_Q, tQgQ, tQsQ, tQcQ, max_coord);
   };

@@ -285,7 +275,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
     // wait for smem copy done before gmem copy
     __syncthreads();
 
-    auto max_coord = make_coord(q_len - m_block * kBlockM, head_dim);
+    auto max_coord = make_coord(q_packed_len - m_block * kBlockM, head_dim);
     safe_copy</*EVEN_MN=*/false, EVEN_K, /*ZFILL_MN=*/false, /*ZFILL_K=*/false>(
         gmem_tiled_copy_O, tOsO, tOgO, tOcO, max_coord);
   };

@@ -296,7 +286,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
       make_tensor(tOrAccO.data(), Layout::to_rowcol(tOrAccO.layout()));
   clear(tOrAccO);
 
-  const int diagonal = m_block * kBlockM + kv_len - q_len;
+  const int diagonal = (m_block * kBlockM) / group_size + kv_len - q_len;
   // process kv in range: [kv_idx_min, kv_idx_max)
   const int kv_idx_min = std::max(0, diagonal - sliding_window);
   const int kv_idx_max = std::min(kv_len, diagonal + kBlockM);

@@ -319,15 +309,35 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
   cp_async_fence();
 
   // ############### Mainloop ###############
-
-  OnlineSoftmax<kRowsPerMMA * size<1>(tOrAccO)> softmax(sm_scale_log2);
-  Mask<kBlockM, kBlockM, ALIBI, LOCAL> mask(
-      q_len, kv_len, sliding_window, alibi_slope);
-
   // attention score accumulator, (MMA,MMA_M,MMA_N)
   auto tSrAccS = partition_fragment_C(tiled_mma, Shape<_BLK_M, _BLK_N>{});
   auto tSrAccS_rc_view =
       make_tensor(tSrAccS.data(), Layout::to_rowcol(tSrAccS.layout()));
+
+  auto apply_logits_soft_cap = [&](auto& tSrAccS) {
+    if constexpr (SOFT_CAP) {
+      CUTE_UNROLL
+      for (int i = 0; i < size(tSrAccS); ++i) {
+        tSrAccS(i) = ptx::tanh(tSrAccS(i) * logits_soft_cap);
+      }
+    }
+  };
+
+  constexpr int kMMA_M = size<1>(tSrAccS);
+  using Softmax = OnlineSoftmax<kRowsPerMMA * kMMA_M>;
+  using Mask = Mask<kBlockM, kBlockM, kRowsPerMMA, kMMA_M, ALIBI, LOCAL>;
+
+  Softmax softmax(sm_scale_log2);
+  Mask mask(tidx,
+            m_block,
+            q_len,
+            kv_len,
+            kv_head_idx,
+            group_size,
+            sliding_window,
+            sm_scale,
+            params.alibi_slopes_ptr);
+
   // seperate oob mask iterations for better performance
   constexpr int n_oob_mask = cute::ceil_div(kBlockM, kBlockN) + 1;
 

@@ -354,7 +364,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
     if constexpr (SOFT_CAP) {
       apply_logits_soft_cap(tSrAccS);
     }
-    mask.apply(tSrAccS_rc_view, m_block, n_block_idx, tidx);
+    mask.apply(tSrAccS_rc_view, n_block_idx);
    softmax.rescale(tSrAccS_rc_view, tOrAccO_rc_view);
 
     // wait value, [v] => []

@@ -396,7 +406,7 @@ __global__ void mha_kernel_sm80(__grid_constant__ const Params params) {
     if constexpr (SOFT_CAP) {
       apply_logits_soft_cap(tSrAccS);
     }
-    mask.apply</*OOB_MASK=*/false>(tSrAccS_rc_view, m_block, n_block_idx, tidx);
+    mask.apply</*OOB_MASK=*/false>(tSrAccS_rc_view, n_block_idx);
     softmax.rescale(tSrAccS_rc_view, tOrAccO_rc_view);
 
     // wait value, [v] => []
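Note: the kernel now walks the M dimension in packed query space (all query heads of one KV-head group laid out contiguously) while the causal diagonal is still measured in token space. Below is a minimal host-side sketch of that bookkeeping, assuming made-up values for kBlockM, group_size, q_len, kv_len, and sliding_window; only the arithmetic mirrors the diff above.

// Standalone illustration of the packed-query bookkeeping. All concrete
// numbers are hypothetical examples, not values from this commit.
#include <algorithm>
#include <cstdio>

int main() {
  const int kBlockM = 64;             // CTA tile size along packed M
  const int group_size = 4;           // n_heads / n_kv_heads
  const int q_len = 32;               // tokens per sequence
  const int kv_len = 128;
  const int sliding_window = kv_len;  // no local attention in this example

  // Queries of all heads in one KV-head group are packed along M.
  const int q_packed_len = q_len * group_size;

  for (int m_block = 0; m_block * kBlockM < q_packed_len; ++m_block) {
    // The causal diagonal lives in token space, so the packed block offset
    // is divided back by group_size before comparing against kv positions.
    const int diagonal = (m_block * kBlockM) / group_size + kv_len - q_len;
    const int kv_idx_min = std::max(0, diagonal - sliding_window);
    const int kv_idx_max = std::min(kv_len, diagonal + kBlockM);
    std::printf("m_block %d -> kv range [%d, %d)\n",
                m_block, kv_idx_min, kv_idx_max);
  }
  return 0;
}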

src/kernels/attention/attention_kernel_sm80_test.cu
Lines changed: 0 additions & 1 deletion

@@ -7,7 +7,6 @@
 #include "attention_params.h"
 #include "attention_ref.h"
 #include "cute/layout.hpp"
-#include "static_dispatch.h"
 
 namespace llm {
 #define DISPATCH_HEAD_DIM_(HEAD_DIM_V, HEAD_DIM_NAME, ...) \

src/kernels/attention/attention_launch_sm80.cuh
Lines changed: 8 additions & 4 deletions

@@ -1,5 +1,8 @@
 #pragma once
 
+#include <cute/int_tuple.hpp>
+#include <cute/layout.hpp>
+
 #include "attention_kernel_sm80.cuh"
 #include "attention_traits_sm80.h"
 #include "static_dispatch.h"

@@ -14,17 +17,18 @@ template <typename Traits,
           bool LOCAL>
 void launch_attention_kernel(const Params& params, cudaStream_t stream) {
   const auto batch_size = params.batch_size;
-  const auto n_heads = params.n_heads;
-  const auto max_q_len = params.max_q_len;
+  const auto n_kv_heads = params.n_kv_heads;
+  const auto max_q_packed_len = params.max_q_len * params.group_size;
 
   const auto smem_size = Traits::kSmemSize;
   auto attention_kernel =
       mha_kernel_sm80<Traits, Params, EVEN_K, ALIBI, SOFT_CAP, LOCAL>;
   cudaFuncSetAttribute(
       attention_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
   // TODO: support persistent kernels
-  dim3 grid(
-      (max_q_len + Traits::kBlockM - 1) / Traits::kBlockM, batch_size, n_heads);
+  dim3 grid(cute::ceil_div(max_q_packed_len, Traits::kBlockM),
+            batch_size,
+            n_kv_heads);
   dim3 block = Traits::kThreadNum;
   attention_kernel<<<grid, block, smem_size, stream>>>(params);
 }
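For reference, a small standalone sketch of the new grid sizing: blockIdx.x now covers blocks of the packed query length and blockIdx.z iterates KV heads rather than query heads. The helper packed_grid, the GridDims struct, and all shapes are hypothetical; ceil_div is written out inline instead of using cute::ceil_div.

// Illustration of the launch-grid change. Example values only.
#include <cstdio>

struct GridDims { int x, y, z; };

GridDims packed_grid(int max_q_len, int n_heads, int n_kv_heads,
                     int batch_size, int kBlockM) {
  const int group_size = n_heads / n_kv_heads;
  const int max_q_packed_len = max_q_len * group_size;
  const int m_blocks = (max_q_packed_len + kBlockM - 1) / kBlockM;  // ceil_div
  return {m_blocks, batch_size, n_kv_heads};
}

int main() {
  // e.g. 32 query heads sharing 8 KV heads -> group_size = 4
  const GridDims grid = packed_grid(/*max_q_len=*/512, /*n_heads=*/32,
                                    /*n_kv_heads=*/8, /*batch_size=*/2,
                                    /*kBlockM=*/64);
  std::printf("grid = (%d, %d, %d)\n", grid.x, grid.y, grid.z);
  return 0;
}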

src/kernels/attention/attention_params.h
Lines changed: 8 additions & 0 deletions

@@ -44,6 +44,7 @@ struct AttentionParamsCommon {
   float sm_scale_log2 = 0.0;
   int32_t block_shift_right = 0;
   int32_t block_mask = 0;
+  int group_size = 0;
 
   // used to initialize the params that used for performance optimization
   void normalize() {

@@ -66,6 +67,8 @@
     }
     sm_scale_log2 = static_cast<float>(sm_scale * M_LOG2E);
 
+    // block size must be power of 2
+    assert(block_size > 0 && (block_size & (block_size - 1)) == 0);
     auto int_log2 = [](int x) {
       int n = 0;
       while (x >>= 1) {

@@ -76,6 +79,9 @@
     block_shift_right = int_log2(block_size);
     block_mask = block_size - 1;
 
+    assert(n_heads % n_kv_heads == 0);
+    group_size = n_heads / n_kv_heads;
+
     normalized = true;
   }
 };

@@ -113,7 +119,9 @@ struct VarLenAttentionParams : public AttentionParamsCommon {
 // paged KV cache
 struct PagedKVAttentionParams : public VarLenAttentionParams {
   // Paged KV cache
+  // the first slot id of each block
   const int* __restrict__ block_table = nullptr;
+  // array of length batch_size + 1 holding starting offset of each sequence.
   const int* __restrict__ block_cu_lens = nullptr;
 };
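A simplified stand-in for the normalize() additions, assuming placeholder field values: the power-of-two check on block_size plus its log2/mask derivation, and the new group_size = n_heads / n_kv_heads. This is a sketch of the arithmetic, not the struct's actual method.

// Example values only; mirrors the checks added to normalize().
#include <cassert>
#include <cstdio>

int main() {
  const int n_heads = 32, n_kv_heads = 8, block_size = 16;

  // block size must be power of 2
  assert(block_size > 0 && (block_size & (block_size - 1)) == 0);
  int block_shift_right = 0;
  for (int x = block_size; x >>= 1;) ++block_shift_right;  // int_log2
  const int block_mask = block_size - 1;

  assert(n_heads % n_kv_heads == 0);
  const int group_size = n_heads / n_kv_heads;

  std::printf("shift=%d mask=%d group_size=%d\n",
              block_shift_right, block_mask, group_size);
  return 0;
}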

src/kernels/attention/attention_tile.h
Lines changed: 93 additions & 46 deletions

@@ -24,21 +24,43 @@ struct AttentionTile<AttentionParams> {
 
   // return the query/output tile: (q_len, head_dim)
   template <typename Element>
-  CUTE_HOST_DEVICE auto get_qo_tile(int batch_idx, int head_idx) const {
+  CUTE_HOST_DEVICE auto get_qo_tile(int batch_idx, int kv_head_idx) const {
     // (batch, seq, head, dim)
-    const auto q_offset = batch_idx * get<0>(params_.q_stride) +
-                          head_idx * get<2>(params_.q_stride);
-    const auto o_offset = batch_idx * get<0>(params_.o_stride) +
-                          head_idx * get<2>(params_.o_stride);
-
-    // q[batch_idx, :, head_idx, :]
-    auto q =
-        make_tensor(make_gmem_ptr((const Element*)params_.q_ptr + q_offset),
-                    make_shape(params_.q_len, params_.head_dim),
-                    make_stride(get<1>(params_.q_stride), _1{}));
-    auto o = make_tensor(make_gmem_ptr((Element*)params_.o_ptr + o_offset),
-                         make_shape(params_.q_len, params_.head_dim),
-                         make_stride(get<1>(params_.o_stride), _1{}));
+
+    // packed all q/o in the same kv head group together
+    // q/o [batch, n_tokens, n_heads, dim]
+    // => q/o [*batch_idx, n_tokens, n_heads, dim]
+    // => q/o [n_tokens, group_size, n_kv_heads, dim]
+    // => q/o [n_tokens, group_size, *kv_head_idx, dim]
+    // => q/o [(group_size, n_tokens), dim]
+    // => q/o [packed_len, dim]
+    const auto group_size = params_.group_size;
+    const auto head_base = kv_head_idx * group_size;
+    auto packed_idx_to_coord = [group_size, head_base](int packed_idx) {
+      const int idx = packed_idx / group_size;
+      const int offset = packed_idx % group_size;
+      // (group_size, n_tokens)
+      return make_coord(head_base + offset, idx);
+    };
+
+    const auto packed_len = params_.q_len * group_size;
+    const auto q_offset = batch_idx * get<0>(params_.q_stride);
+    auto q = make_gather_tensor(
+        make_gmem_ptr((const Element*)params_.q_ptr + q_offset),
+        make_shape(packed_len, params_.head_dim),
+        make_stride(
+            make_stride(get<2>(params_.q_stride), get<1>(params_.q_stride)),
+            _1{}),
+        packed_idx_to_coord);
+
+    const auto o_offset = batch_idx * get<0>(params_.o_stride);
+    auto o = make_gather_tensor(
+        make_gmem_ptr((Element*)params_.o_ptr + o_offset),
+        make_shape(packed_len, params_.head_dim),
+        make_stride(
+            make_stride(get<2>(params_.o_stride), get<1>(params_.o_stride)),
+            _1{}),
+        packed_idx_to_coord);
     return make_tuple(q, o);
   }
 

@@ -75,24 +97,37 @@ struct AttentionTile<VarLenAttentionParams> {
 
   // return the query tile: (q_len, head_dim)
   template <typename Element>
-  CUTE_HOST_DEVICE auto get_qo_tile(int batch_idx, int head_idx) const {
+  CUTE_HOST_DEVICE auto get_qo_tile(int batch_idx, int kv_head_idx) const {
     const auto begin = params_.q_cu_lens[batch_idx];
     const auto qo_len = params_.q_cu_lens[batch_idx + 1] - begin;
-    // (seq, head, dim)
-    const auto q_offset =
-        begin * get<0>(params_.q_stride) + head_idx * get<1>(params_.q_stride);
-    const auto o_offset =
-        begin * get<0>(params_.o_stride) + head_idx * get<1>(params_.o_stride);
-
-    // q[begin:begin + q_len, head_idx, :]
-    auto q =
-        make_tensor(make_gmem_ptr((const Element*)params_.q_ptr + q_offset),
-                    make_shape(qo_len, params_.head_dim),
-                    make_stride(get<0>(params_.q_stride), _1{}));
-    // o[begin:begin + o_len, head_idx, :]
-    auto o = make_tensor(make_gmem_ptr((Element*)params_.o_ptr + o_offset),
-                         make_shape(qo_len, params_.head_dim),
-                         make_stride(get<0>(params_.o_stride), _1{}));
+
+    const auto group_size = params_.group_size;
+    const auto head_base = kv_head_idx * group_size;
+    auto packed_idx_to_coord = [group_size, head_base](int packed_idx) {
+      const int idx = packed_idx / group_size;
+      const int offset = packed_idx % group_size;
+      // (group_size, n_tokens)
+      return make_coord(head_base + offset, idx);
+    };
+
+    const auto packed_len = qo_len * group_size;
+    const auto q_offset = begin * get<0>(params_.q_stride);
+    auto q = make_gather_tensor(
+        make_gmem_ptr((const Element*)params_.q_ptr + q_offset),
+        make_shape(packed_len, params_.head_dim),
+        make_stride(
+            make_stride(get<1>(params_.q_stride), get<0>(params_.q_stride)),
+            _1{}),
+        packed_idx_to_coord);
+
+    const auto o_offset = begin * get<0>(params_.o_stride);
+    auto o = make_gather_tensor(
+        make_gmem_ptr((Element*)params_.o_ptr + o_offset),
+        make_shape(packed_len, params_.head_dim),
+        make_stride(
+            make_stride(get<1>(params_.o_stride), get<0>(params_.o_stride)),
+            _1{}),
+        packed_idx_to_coord);
     return make_tuple(q, o);
   }
 

@@ -132,24 +167,36 @@ struct AttentionTile<PagedKVAttentionParams> {
 
   // return the query/output tile: (q_len, head_dim)
   template <typename Element>
-  CUTE_HOST_DEVICE auto get_qo_tile(int batch_idx, int head_idx) const {
+  CUTE_HOST_DEVICE auto get_qo_tile(int batch_idx, int kv_head_idx) const {
     const auto begin = params_.q_cu_lens[batch_idx];
     const auto qo_len = params_.q_cu_lens[batch_idx + 1] - begin;
-    // (seq, head, dim)
-    const auto q_offset =
-        begin * get<0>(params_.q_stride) + head_idx * get<1>(params_.q_stride);
-    const auto o_offset =
-        begin * get<0>(params_.o_stride) + head_idx * get<1>(params_.o_stride);
-
-    // q[begin:begin + q_len, head_idx, :]
-    auto q =
-        make_tensor(make_gmem_ptr((const Element*)params_.q_ptr + q_offset),
-                    make_shape(qo_len, params_.head_dim),
-                    make_stride(get<0>(params_.q_stride), _1{}));
-    // o[begin:begin + o_len, head_idx, :]
-    auto o = make_tensor(make_gmem_ptr((Element*)params_.o_ptr + o_offset),
-                         make_shape(qo_len, params_.head_dim),
-                         make_stride(get<0>(params_.o_stride), _1{}));
+    const auto group_size = params_.group_size;
+    const auto head_base = kv_head_idx * group_size;
+    auto packed_idx_to_coord = [group_size, head_base](int packed_idx) {
+      const int idx = packed_idx / group_size;
+      const int offset = packed_idx % group_size;
+      // (group_size, n_tokens)
+      return make_coord(head_base + offset, idx);
+    };
+
+    const auto packed_len = qo_len * group_size;
+    const auto q_offset = begin * get<0>(params_.q_stride);
+    auto q = make_gather_tensor(
+        make_gmem_ptr((const Element*)params_.q_ptr + q_offset),
+        make_shape(packed_len, params_.head_dim),
+        make_stride(
+            make_stride(get<1>(params_.q_stride), get<0>(params_.q_stride)),
+            _1{}),
+        packed_idx_to_coord);
+
+    const auto o_offset = begin * get<0>(params_.o_stride);
+    auto o = make_gather_tensor(
+        make_gmem_ptr((Element*)params_.o_ptr + o_offset),
+        make_shape(packed_len, params_.head_dim),
+        make_stride(
+            make_stride(get<1>(params_.o_stride), get<0>(params_.o_stride)),
+            _1{}),
+        packed_idx_to_coord);
     return make_tuple(q, o);
   }
 
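To make the gather mapping concrete: packed query/output rows are laid out as (group_size, n_tokens), so consecutive packed indices walk through the query heads of one KV-head group before advancing to the next token. The sketch below reproduces packed_idx_to_coord on the host with example shapes (group_size, kv_head_idx, n_tokens are hypothetical).

// Host-side illustration of the packed-index-to-coordinate mapping.
#include <cstdio>

int main() {
  const int group_size = 4;   // query heads per KV head
  const int kv_head_idx = 1;  // corresponds to blockIdx.z in the kernel
  const int n_tokens = 3;
  const int head_base = kv_head_idx * group_size;

  const int packed_len = n_tokens * group_size;
  for (int packed_idx = 0; packed_idx < packed_len; ++packed_idx) {
    const int idx = packed_idx / group_size;     // token index
    const int offset = packed_idx % group_size;  // head within the group
    std::printf("packed %2d -> (head=%d, token=%d)\n",
                packed_idx, head_base + offset, idx);
  }
  return 0;
}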

src/kernels/attention/attention_traits_test.cpp
Lines changed: 2 additions & 0 deletions

@@ -3,6 +3,8 @@
 #include <cute/tensor.hpp>
 
 #include "attention_traits_sm80.h"
+#include "cute/layout_composed.hpp"
+#include "gather_tensor.hpp"
 
 namespace llm {
