Commit 8d41e66

feat: added kernel builder for attn (#493)
1 parent 83a4415 commit 8d41e66
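
This change routes attention kernel composition through a new KernelBuilder keyed on a CUTLASS architecture tag, and FmhaRunner now takes that tag as its first template parameter. A minimal sketch of the new call path, assuming cutlass::half_t and a head dim of 64 purely as illustrative choices (the test below picks both at runtime via dispatch macros) and an already-populated llm::FmhaParams named params:

    using ArchTag = cutlass::arch::Sm120;
    // dtype and head dim are placeholders here, not values taken from this commit
    llm::FmhaRunner<ArchTag, cutlass::half_t, /*kHeadDim=*/64>::run(params, /*stream=*/nullptr);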

File tree

6 files changed: +126, -39 lines changed


src/kernels/attention/common/fmha_block.h

Lines changed: 4 additions & 8 deletions
@@ -14,17 +14,13 @@ using namespace cute;
 // AttentionTile specialization for AttentionParams
 template <typename TileShape,  // (BLK_M, BLK_N, BLK_K)
           typename Element,    // Element type
+          typename StrideQ,    // (B, Q, H, D)
+          typename StrideK,    // (B, K, KH, D)
+          typename StrideV,    // (B, K, KH, D)
+          typename StrideO,    // (B, Q, H, D)
           bool kLocal>
 struct FmhaBlock {
-  // (B, Q, H, D)
-  using StrideQ = Stride<int64_t, int64_t, int64_t, _1>;
-  using StrideO = StrideQ;
-  // (B, K, KH, D)
-  using StrideK = Stride<int64_t, int64_t, int64_t, _1>;
-  using StrideV = StrideK;
-
   // Host side parameters
-
   struct Arguments {
     const void* __restrict__ q_ptr;
     const void* __restrict__ k_ptr;
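
With the strides lifted out of FmhaBlock into template parameters, callers now spell out the Q/K/V/O layouts at instantiation time. A sketch of a concrete instantiation under this change, assuming a TileShape alias like the one defined in fmha_runner.h, with cutlass::half_t and kLocal=false used purely as placeholders:

    // Q/O are (B, Q, H, D) and K/V are (B, K, KH, D); the innermost dim is contiguous
    using StrideQ = cute::Stride<int64_t, int64_t, int64_t, cute::_1>;
    using StrideK = cute::Stride<int64_t, int64_t, int64_t, cute::_1>;
    using Block = llm::FmhaBlock<TileShape, cutlass::half_t,
                                 StrideQ, StrideK, /*StrideV=*/StrideK, /*StrideO=*/StrideQ,
                                 /*kLocal=*/false>;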

src/kernels/attention/fmha_runner.h

Lines changed: 22 additions & 29 deletions
@@ -5,20 +5,14 @@
 #include <cute/layout.hpp>
 #include <cute/tensor.hpp>

-#include "collective/sm120_collective_epilogue.cuh"
-#include "collective/sm120_collective_fmha_mainloop_ws.cuh"
-#include "common/fmha_block.h"
-#include "common/tile_scheduler.cuh"
 #include "device/fmha.cuh"
 #include "fmha_params.h"
-#include "kernel/sm120_kernel_fmha_ws.cuh"
+#include "kernel/kernel_builder.h"  // IWYU pragma: keep

 namespace llm {
-// ? Should include ArchTag?
-// * select right kernel based on ArchTag?
-// ? how to support fast compliling?
+// TODO: support fast compiling
 // * only compile the kernel for the target compute capability
-template <typename Element, int kHeadDim>
+template <class ArchTag, typename Element, int kHeadDim>
 class FmhaRunner {
  public:
   static bool run(const FmhaParams& params, cudaStream_t stream = nullptr) {

@@ -64,26 +58,25 @@ class FmhaRunner {

     using TileShape = Shape<Int<BLK_M>, Int<BLK_N>, Int<kHeadDim>>;

-    using Block = FmhaBlock<TileShape, Element, LOCAL>;
-
-    using CollectiveMainloop = Sm120CollectiveFMhaWs<TileShape,
-                                                     Element,
-                                                     EVEN_K,
-                                                     ALIBI,
-                                                     SOFT_CAP,
-                                                     LOCAL,
-                                                     KV_USE_TMA>;
-    using CollectiveEpilogue =
-        Sm120CollectiveEpilogue<TileShape, Element, EVEN_K>;
-
-    // TODO: support persistent kernels
-    using TileScheduler = SingleTileScheduler;
-
-    using AttnKernel = Sm120KernelFmhaWs<ProblemShape,
-                                         Block,
-                                         CollectiveMainloop,
-                                         CollectiveEpilogue,
-                                         TileScheduler>;
+    // (B, Q, H, D)
+    using StrideQ = Stride<int64_t, int64_t, int64_t, _1>;
+    using StrideK = Stride<int64_t, int64_t, int64_t, _1>;
+    using StrideV = StrideK;
+    using StrideO = StrideQ;
+
+    using AttnKernel = typename KernelBuilder<ArchTag,
+                                              ProblemShape,
+                                              TileShape,
+                                              Element,
+                                              StrideQ,
+                                              StrideK,
+                                              StrideV,
+                                              StrideO,
+                                              EVEN_K,
+                                              ALIBI,
+                                              SOFT_CAP,
+                                              LOCAL,
+                                              KV_USE_TMA>::Kernel;

     assert(params.n_heads % params.n_kv_heads == 0 &&
            "n_heads must be divisible by n_kv_heads");
src/kernels/attention/kernel/builders/kernel_builder_decl.h

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cute/util/type_traits.hpp>
+
+namespace llm {
+
+template <class ArchTag,
+          class ProblemShape,
+          class TileShape,
+          class Element,
+          class StrideQ,
+          class StrideK,
+          class StrideV,
+          class StrideO,
+          bool EVEN_K,
+          bool ALIBI,
+          bool SOFT_CAP,
+          bool LOCAL,
+          bool KV_USE_TMA,
+          class Enable = void>
+struct KernelBuilder {
+  static_assert(cute::dependent_false<Element>,
+                "Could not build a kernel for the given parameters.");
+};
+
+}  // namespace llm
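
The primary template above deliberately has nothing usable in it: requesting KernelBuilder<...>::Kernel for a parameter combination that no specialization covers fails at compile time with the message in the static_assert. A stripped-down, self-contained illustration of this dependent-false dispatch pattern (simplified names, not the real signature):

    #include <type_traits>

    // dependent false: only evaluated when the primary template is instantiated
    template <class T>
    inline constexpr bool dependent_false_v = false;

    struct Sm120 {};
    struct Sm90 {};

    template <class ArchTag, class Enable = void>
    struct BuilderSketch {
      static_assert(dependent_false_v<ArchTag>, "no kernel for this architecture");
    };

    // per-arch specializations opt in by providing ::Kernel
    template <class ArchTag>
    struct BuilderSketch<ArchTag, std::enable_if_t<std::is_same_v<ArchTag, Sm120>>> {
      using Kernel = int;  // stands in for Sm120KernelFmhaWs<...>
    };

    using Ok = BuilderSketch<Sm120>::Kernel;     // compiles
    // using Bad = BuilderSketch<Sm90>::Kernel;  // would trip the static_assert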

src/kernels/attention/kernel/builders/sm120_kernel_builder.inl

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+#pragma once
+
+#include <cutlass/arch/arch.h>
+
+#include <cute/tensor.hpp>
+
+#include "collective/sm120_collective_epilogue.cuh"
+#include "collective/sm120_collective_fmha_mainloop_ws.cuh"
+#include "common/fmha_block.h"
+#include "common/tile_scheduler.cuh"
+#include "kernel/sm120_kernel_fmha_ws.cuh"
+#include "kernel_builder_decl.h"
+
+namespace llm {
+
+template <class ProblemShape,
+          class TileShape,
+          class Element,
+          class StrideQ,
+          class StrideK,
+          class StrideV,
+          class StrideO,
+          bool EVEN_K,
+          bool ALIBI,
+          bool SOFT_CAP,
+          bool LOCAL,
+          bool KV_USE_TMA>
+struct KernelBuilder<cutlass::arch::Sm120,
+                     ProblemShape,
+                     TileShape,
+                     Element,
+                     StrideQ,
+                     StrideK,
+                     StrideV,
+                     StrideO,
+                     EVEN_K,
+                     ALIBI,
+                     SOFT_CAP,
+                     LOCAL,
+                     KV_USE_TMA,
+                     cute::enable_if_t<not cute::is_tuple_v<Element>>> {
+  using Block =
+      FmhaBlock<TileShape, Element, StrideQ, StrideK, StrideV, StrideO, LOCAL>;
+
+  using CollectiveMainloop = Sm120CollectiveFMhaWs<TileShape,
+                                                   Element,
+                                                   EVEN_K,
+                                                   ALIBI,
+                                                   SOFT_CAP,
+                                                   LOCAL,
+                                                   KV_USE_TMA>;
+  using CollectiveEpilogue =
+      Sm120CollectiveEpilogue<TileShape, Element, EVEN_K>;
+
+  // TODO: support persistent kernels
+  using TileScheduler = SingleTileScheduler;
+
+  using Kernel = Sm120KernelFmhaWs<ProblemShape,
+                                   Block,
+                                   CollectiveMainloop,
+                                   CollectiveEpilogue,
+                                   TileScheduler>;
+};
+
+}  // namespace llm
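
The trailing cute::enable_if_t<not cute::is_tuple_v<Element>> constraint keeps this specialization from matching tuple element types, which presumably leaves room for a separate Sm120 specialization for those later without ambiguity. A small self-contained sketch of how the extra Enable slot lets two constrained specializations of the same arch tag coexist (std::is_class_v stands in for cute::is_tuple_v; names are illustrative only):

    #include <type_traits>

    struct Sm120 {};
    struct FakeTuple {};  // stands in for a cute tuple element type

    template <class ArchTag, class Element, class Enable = void>
    struct BuilderSketch;  // primary left undefined for brevity

    // non-tuple element types select this specialization
    template <class Element>
    struct BuilderSketch<Sm120, Element, std::enable_if_t<!std::is_class_v<Element>>> {
      static constexpr bool tuple_path = false;
    };

    // tuple-like element types would select this (hypothetical) one
    template <class Element>
    struct BuilderSketch<Sm120, Element, std::enable_if_t<std::is_class_v<Element>>> {
      static constexpr bool tuple_path = true;
    };

    static_assert(!BuilderSketch<Sm120, float>::tuple_path);
    static_assert(BuilderSketch<Sm120, FakeTuple>::tuple_path);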

src/kernels/attention/kernel/kernel_builder.h

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+#pragma once
+
+// kernel builder declarations
+#include "builders/kernel_builder_decl.h"  // IWYU pragma: keep
+
+// kernel builder implementations
+#include "builders/sm120_kernel_builder.inl"  // IWYU pragma: keep

src/kernels/attention/tests/sm120_fmha_test.cu

Lines changed: 2 additions & 2 deletions
@@ -89,10 +89,10 @@ torch::Tensor sm120_fmha(
           : nullptr;

   // params.max_q_len = max_q_len;
-
+  using ArchTag = cutlass::arch::Sm120;
   DISPATCH_TORCH_DTYPE_(query.dtype(), Dtype, [&] {
     DISPATCH_HEAD_DIM_(head_dim, HEAD_DIM, [&] {
-      FmhaRunner<Dtype, HEAD_DIM>::run(params, /*stream=*/nullptr);
+      FmhaRunner<ArchTag, Dtype, HEAD_DIM>::run(params, /*stream=*/nullptr);
     });
   });
   return out;
