Commit 5151644

attn: integrate in-house scale attention and use it by default
1 parent: a6af766

7 files changed (+79 −24 lines)

src/kernels/attention/attention_kernel_sm80_test.cu

Lines changed: 11 additions & 4 deletions

@@ -94,7 +94,8 @@ torch::Tensor attention_sm80(
 }  // namespace
 
 class AttentionKernelTest
-    : public ::testing::TestWithParam<std::tuple<int64_t /*batch_size*/,
+    : public ::testing::TestWithParam<std::tuple<torch::ScalarType /*q_dtype*/,
+                                                 int64_t /*batch_size*/,
                                                  int64_t /*q_len*/,
                                                  int64_t /*kv_len*/,
                                                  int64_t /*n_heads*/,
@@ -111,7 +112,8 @@ class AttentionKernelTest
 };
 
 TEST_P(AttentionKernelTest, MHA) {
-  const auto [batch_size,
+  const auto [dtype,
+              batch_size,
               q_len,
               kv_len,
               n_heads,
@@ -121,7 +123,7 @@ TEST_P(AttentionKernelTest, MHA) {
               alibi,
               sliding_window] = GetParam();
 
-  const auto options = torch::dtype(torch::kHalf).device(torch::kCUDA);
+  const auto options = torch::dtype(dtype).device(torch::kCUDA);
 
   // construct non-contiguous query, key and value
   const auto data = torch::randn(
@@ -143,13 +145,18 @@ TEST_P(AttentionKernelTest, MHA) {
   auto out = attention_sm80(
       query, key, value, alibi_slopes, logits_soft_cap, sliding_window, q_len);
 
-  EXPECT_TRUE(torch::allclose(out, ref_out, /*rtol=*/1e-3, /*atol=*/1e-3));
+  if (dtype == torch::kBFloat16) {
+    EXPECT_TRUE(torch::allclose(out, ref_out, /*rtol=*/1e-2, /*atol=*/1e-2));
+  } else {
+    EXPECT_TRUE(torch::allclose(out, ref_out, /*rtol=*/1e-3, /*atol=*/1e-3));
+  }
 }
 
 INSTANTIATE_TEST_SUITE_P(
     MHA,
     AttentionKernelTest,
     ::testing::Combine(
+        ::testing::Values(torch::kHalf, torch::kBFloat16),  // q_dtype
        ::testing::Values(1, 2, 4),                          // batch_size
        ::testing::Values(1, 62, 125),                       // q_len
        ::testing::Values(127, 287, 1000),                   // kv_len
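
Note on the relaxed tolerance: bfloat16 keeps only 8 bits of significand precision (1 implicit + 7 stored) versus fp16's 11 (1 implicit + 10 stored), so element-wise rounding error is roughly an order of magnitude larger, which is why the test loosens rtol/atol from 1e-3 to 1e-2 for bf16. A small standalone libtorch check (not part of the commit) that illustrates the gap via a round-trip cast:

#include <torch/torch.h>

#include <iostream>

int main() {
  const auto x = torch::randn({1024}, torch::kFloat32);
  // round-trip through half and bfloat16, then measure the worst relative error
  const auto err_fp16 = ((x.to(torch::kHalf).to(torch::kFloat32) - x).abs() /
                         x.abs().clamp_min(1e-6)).max();
  const auto err_bf16 = ((x.to(torch::kBFloat16).to(torch::kFloat32) - x).abs() /
                         x.abs().clamp_min(1e-6)).max();
  std::cout << "max rel err fp16: " << err_fp16.item<float>() << "\n"
            << "max rel err bf16: " << err_bf16.item<float>() << "\n";
  return 0;
}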

src/kernels/attention/attention_launch_sm80.cuh

Lines changed: 6 additions & 6 deletions

@@ -51,43 +51,43 @@ void run_attention_kernel(const Params& params, cudaStream_t stream) {
 }  // namespace detail
 
 // user-facing function to run the attention kernel
-template <typename Element, int HEAD_DIM, typename Params>
+template <typename Dtype, int HEAD_DIM, typename Params>
 void run_attention_kernel_sm80(Params& params, cudaStream_t stream = nullptr) {
   // normalize params that for performance optimization
   params.normalize();
 
   // TODO: tune block shape MNK based on the head dim and smem size
   if constexpr (HEAD_DIM == 64) {
-    using Traits = AttentionTraitsSM80<Element,
+    using Traits = AttentionTraitsSM80<Dtype,
                                        HEAD_DIM,
                                        /*BLK_M=*/64,
                                        /*BLK_N=*/64,
                                        /*BLK_K=*/64>;
     detail::run_attention_kernel<Traits>(params, stream);
   } else if constexpr (HEAD_DIM == 96) {
-    using Traits = AttentionTraitsSM80<Element,
+    using Traits = AttentionTraitsSM80<Dtype,
                                        HEAD_DIM,
                                        /*BLK_M=*/64,
                                        /*BLK_N=*/64,
                                        /*BLK_K=*/32>;
     detail::run_attention_kernel<Traits>(params, stream);
   } else if constexpr (HEAD_DIM == 128) {
-    using Traits = AttentionTraitsSM80<Element,
+    using Traits = AttentionTraitsSM80<Dtype,
                                        HEAD_DIM,
                                        /*BLK_M=*/64,
                                        /*BLK_N=*/64,
                                        /*BLK_K=*/64>;
     detail::run_attention_kernel<Traits>(params, stream);
   } else if constexpr (HEAD_DIM == 256) {
-    using Traits = AttentionTraitsSM80<Element,
+    using Traits = AttentionTraitsSM80<Dtype,
                                        HEAD_DIM,
                                        /*BLK_M=*/64,
                                        /*BLK_N=*/64,
                                        /*BLK_K=*/64>;
     detail::run_attention_kernel<Traits>(params, stream);
   } else {
     // use the default block size
-    using Traits = AttentionTraitsSM80<Element,
+    using Traits = AttentionTraitsSM80<Dtype,
                                        HEAD_DIM,
                                        /*BLK_M=*/64,
                                        /*BLK_N=*/64,
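
Because Dtype and HEAD_DIM are compile-time template parameters, a caller still has to bridge from runtime values. A minimal caller-side sketch (not from the commit): run_attention_kernel_sm80 and its head-dim specializations come from the header above, while launch_mha_sm80, the cute type aliases, the include path, and the namespace handling are assumptions that may need adjusting to the repo's layout and CUTLASS version.

#include <cuda_runtime.h>
#include <torch/torch.h>

#include <cute/numeric/numeric_types.hpp>  // cute::half_t / cute::bfloat16_t

#include "attention_launch_sm80.cuh"

template <typename Params>
void launch_mha_sm80(Params& params,
                     torch::ScalarType dtype,
                     int head_dim,
                     cudaStream_t stream) {
  // bridge runtime dtype/head_dim to the compile-time template parameters
  auto with_dtype = [&](auto type_tag) {
    using Dtype = decltype(type_tag);
    switch (head_dim) {
      case 64:  run_attention_kernel_sm80<Dtype, 64>(params, stream);  break;
      case 96:  run_attention_kernel_sm80<Dtype, 96>(params, stream);  break;
      case 128: run_attention_kernel_sm80<Dtype, 128>(params, stream); break;
      case 256: run_attention_kernel_sm80<Dtype, 256>(params, stream); break;
      default:  TORCH_CHECK(false, "unsupported head_dim: ", head_dim);
    }
  };
  if (dtype == torch::kBFloat16) {
    with_dtype(cute::bfloat16_t{});
  } else {
    with_dtype(cute::half_t{});
  }
}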

src/kernels/attention/attn_api.cpp

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+#include "attn_api.h"
+namespace llm {
+void paged_kv_varlen_mha(
+    torch::Tensor& out,               // [n_tokens, n_heads, head_dim]
+    const torch::Tensor& q,           // [n_tokens, n_heads, head_dim]
+    const torch::Tensor& k_cache,     // [n_slots, n_kv_heads, head_dim]
+    const torch::Tensor& v_cache,     // [n_slots, n_kv_heads, head_dim]
+    const torch::Tensor& q_cu_lens,   // [batch + 1]
+    const torch::Tensor& kv_cu_lens,  // [batch + 1]
+    const torch::Tensor& block_table,
+    const torch::Tensor& block_cu_lens,                // [batch + 1]
+    const std::optional<torch::Tensor>& alibi_slopes,  // [n_heads]
+    int block_size,
+    int max_q_len,
+    int max_kv_len,
+    float sm_scale,
+    float logits_soft_cap,
+    int sliding_window) {}
+
+}  // namespace llm

src/kernels/attention/attn_api.h

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#include <torch/torch.h>
+#include <torch/types.h>
+
+namespace llm {
+// the input tensors are packed into one-dimensional tensors, and the sequence
+// lengths are stored in q_cu_lens and kv_cu_lens.
+// for each sequence,
+//   the starting offset: q/kv_cu_lens[i]
+//   the length: q/kv_cu_lens[i+1] - q/kv_cu_lens[i].
+// the maximum sequence length is max_q_len and max_kv_len, which are used
+// to decide the kernel dispatch.
+void paged_kv_varlen_mha(
+    torch::Tensor& out,               // [n_tokens, n_heads, head_dim]
+    const torch::Tensor& q,           // [n_tokens, n_heads, head_dim]
+    const torch::Tensor& k_cache,     // [n_slots, n_kv_heads, head_dim]
+    const torch::Tensor& v_cache,     // [n_slots, n_kv_heads, head_dim]
+    const torch::Tensor& q_cu_lens,   // [batch + 1]
+    const torch::Tensor& kv_cu_lens,  // [batch + 1]
+    const torch::Tensor& block_table,
+    const torch::Tensor& block_cu_lens,                // [batch + 1]
+    const std::optional<torch::Tensor>& alibi_slopes,  // [n_heads]
+    int block_size,
+    int max_q_len,
+    int max_kv_len,
+    float sm_scale,
+    float logits_soft_cap,
+    int sliding_window);
+
+}  // namespace llm
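
The cumulative-length ("cu_lens") convention documented in the header is the usual varlen packing: for a batch of N sequences there are N + 1 entries, the first is 0, and sequence i occupies rows [q_cu_lens[i], q_cu_lens[i+1]) of the packed tensor. A small illustrative snippet (not part of the commit) that builds q_cu_lens from per-sequence lengths and recovers the per-sequence offsets:

#include <torch/torch.h>

#include <iostream>

int main() {
  // three sequences with query lengths {5, 1, 3} pack into a 9-token q tensor
  const auto q_lens = torch::tensor({5, 1, 3}, torch::kInt32);
  // prepend a zero and take the running sum: q_cu_lens = {0, 5, 6, 9}
  const auto q_cu_lens = torch::cat(
      {torch::zeros({1}, torch::kInt32), torch::cumsum(q_lens, 0).to(torch::kInt32)});
  const int64_t batch = q_cu_lens.size(0) - 1;
  for (int64_t i = 0; i < batch; ++i) {
    const auto start = q_cu_lens[i].item<int64_t>();
    const auto len = q_cu_lens[i + 1].item<int64_t>() - start;
    // sequence i occupies rows [start, start + len) of the packed tensor
    std::cout << "seq " << i << ": offset " << start << ", len " << len << "\n";
  }
  return 0;
}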

src/layers/attention/CMakeLists.txt

Lines changed: 2 additions & 3 deletions

@@ -9,21 +9,20 @@ cc_library(
     handler.h
     ref_handler.h
     flash_attn_handler.h
-    flash_infer_handler.h
+    scale_attn_handler.h
     attention.h
   SRCS
     handler.cpp
     ref_handler.cpp
     flash_attn_handler.cpp
-    flash_infer_handler.cpp
+    scale_attn_handler.cpp
     attention.cpp
   DEPS
     :state_dict
     :memory
     :pos_embedding
     :kernels
     :flash_attn.kernels
-    # :flash_infer.kernels
     glog::glog
     gflags::gflags
     torch

src/layers/attention/flash_infer_handler.cpp renamed to src/layers/attention/scale_attn_handler.cpp

Lines changed: 6 additions & 6 deletions

@@ -1,4 +1,4 @@
-#include "flash_infer_handler.h"
+#include "scale_attn_handler.h"
 
 #include <torch/torch.h>
 
@@ -7,7 +7,7 @@
 
 namespace llm {
 
-FlashInferHandler::FlashInferHandler(float scale,
+ScaleAttnHandler::ScaleAttnHandler(float scale,
                                      int64_t rotary_dim,
                                      int64_t max_position,
                                      float rope_scaling,
@@ -17,13 +17,13 @@ FlashInferHandler::FlashInferHandler(float scale,
   LOG(FATAL) << "Not implemented yet";
 }
 
-FlashInferHandler::FlashInferHandler(
+ScaleAttnHandler::ScaleAttnHandler(
     float scale,
     torch::optional<torch::Tensor> alibi_slopes)
    : scale_(scale), alibi_slopes_(alibi_slopes) {}
 
 // batch prefill for attention, optimized for prefill stage
-void FlashInferHandler::batch_prefill(
+void ScaleAttnHandler::batch_prefill(
     const torch::Tensor& query,  // [n_tokens, n_heads, head_dim]
     const torch::Tensor& key,    // [n_tokens, n_kv_heads, head_dim]
     const torch::Tensor& value,  // [n_tokens, n_kv_heads, head_dim]
@@ -36,7 +36,7 @@ void FlashInferHandler::batch_prefill(
 
 // batch decode for attention, optimized for decode stage
 // support multiple queries: one sequence with multiple query tokens
-void FlashInferHandler::batch_decode(
+void ScaleAttnHandler::batch_decode(
     const torch::Tensor& query,           // [n_tokens, n_heads, head_dim]
     const KVCache& kv_cache,              // where to retrieval key and value
     const InputParameters& input_params,  // input paras used for attention
@@ -47,7 +47,7 @@ void FlashInferHandler::batch_decode(
 }
 
 // append key and value to kv_cache
-void FlashInferHandler::append_kv_cache(
+void ScaleAttnHandler::append_kv_cache(
     KVCache& kv_cache,           // where to store key and value
     const torch::Tensor& key,    // [n_tokens, n_kv_heads, head_dim]
     const torch::Tensor& value,  // [n_tokens, n_kv_heads, head_dim]

src/layers/attention/flash_infer_handler.h renamed to src/layers/attention/scale_attn_handler.h

Lines changed: 5 additions & 5 deletions

@@ -9,10 +9,10 @@
 namespace llm {
 
 // an flash attn implementation for attention operations
-class FlashInferHandler : public AttentionHandler {
+class ScaleAttnHandler : public AttentionHandler {
  public:
  // create a flash attn handler with rope positional embedding
-  FlashInferHandler(float scale,
+  ScaleAttnHandler(float scale,
                     int64_t rotary_dim,
                     int64_t max_position,
                     float rope_scaling,
@@ -21,9 +21,9 @@ class FlashInferHandler : public AttentionHandler {
                     const torch::TensorOptions& options);
 
   // constructor for attention with alibi
-  FlashInferHandler(float scale, torch::optional<torch::Tensor> alibi_slopes);
+  ScaleAttnHandler(float scale, std::optional<torch::Tensor> alibi_slopes);
 
-  virtual ~FlashInferHandler() = default;
+  virtual ~ScaleAttnHandler() = default;
 
   std::tuple<torch::Tensor, torch::Tensor> apply_pos_emb(
       const torch::Tensor& query,
@@ -63,7 +63,7 @@ class FlashInferHandler : public AttentionHandler {
   float scale_ = 0.0;
 
   // alibi slops
-  torch::optional<torch::Tensor> alibi_slopes_;
+  std::optional<torch::Tensor> alibi_slopes_;
 };
 
 }  // namespace llm
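
After the rename, call sites only need the new class name; the constructor shapes are unchanged apart from torch::optional becoming std::optional. A hypothetical call site (not from the commit) showing the alibi constructor with the conventional 1/sqrt(head_dim) softmax scale; make_handler is an illustrative name, and AttentionHandler is assumed to live in namespace llm alongside the handler:

#include <cmath>
#include <memory>
#include <optional>

#include <torch/torch.h>

#include "scale_attn_handler.h"

// Build the in-house handler; pass std::nullopt when alibi is not used.
std::unique_ptr<llm::AttentionHandler> make_handler(
    int64_t head_dim,
    std::optional<torch::Tensor> alibi_slopes = std::nullopt) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  return std::make_unique<llm::ScaleAttnHandler>(scale, std::move(alibi_slopes));
}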
