
Commit 0c09cb2

Support bf16 in blackwell cutlass decode attention kernel
Summary:
1. Reduce pipeline stages to avoid exceeding the smem limit.
2. Add a static_assert so that an smem capacity violation is raised at compile time rather than at runtime.
3. Select the TMEM intrinsics based on sizeof(Element).
4. Update the unit test to include bf16.
5. Label decode kernel test names with their corresponding test parameters.

Differential Revision: D82991495
1 parent 5911680 commit 0c09cb2

File tree: 5 files changed (+70, -11 lines)


fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu

Lines changed: 7 additions & 2 deletions

@@ -262,10 +262,15 @@ struct GenRunner {
 };
 
 // Dispatch macros for different element types
-// TODO(henrylhtsang / ayaoibrahim1123): Add support for other data types.
 #define DISPATCH_ELEMENT_TYPE(DTYPE, ELEMENT_TYPE, ...) \
   [&] { \
-    if (DTYPE == at::kFloat8_e4m3fn) { \
+    if (DTYPE == at::kHalf) { \
+      using ELEMENT_TYPE = cutlass::half_t; \
+      return __VA_ARGS__(); \
+    } else if (DTYPE == at::kBFloat16) { \
+      using ELEMENT_TYPE = cutlass::bfloat16_t; \
+      return __VA_ARGS__(); \
+    } else if (DTYPE == at::kFloat8_e4m3fn) { \
       using ELEMENT_TYPE = cutlass::float_e4m3_t; \
       return __VA_ARGS__(); \
     } else { \
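
With half and bfloat16 handled, the macro follows the usual ATen-style dispatch pattern: the caller passes the alias name and a lambda body, and the body is textually substituted into the branch where that alias is defined. A minimal sketch of the call pattern, assuming the macro's expansion invokes the generated lambda and using a hypothetical run_gen_fmha<Element> launcher that is not part of this diff:

// Hedged sketch of how DISPATCH_ELEMENT_TYPE is typically consumed.
// `run_gen_fmha` is a hypothetical templated launcher, not part of this diff.
at::Tensor dispatch_example(const at::Tensor& q) {
  return DISPATCH_ELEMENT_TYPE(q.scalar_type(), Element, [&] {
    // Inside this body, `Element` is cutlass::half_t, cutlass::bfloat16_t,
    // or cutlass::float_e4m3_t, matching q's dtype; any other dtype falls
    // through to the macro's else branch.
    return run_gen_fmha<Element>(q);
  });
}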

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/fmha_common.hpp

Lines changed: 32 additions & 0 deletions

@@ -126,3 +126,35 @@ void warpgroup_reg_set() {
 }
 
 } // namespace cutlass::fmha::collective
+
+namespace constexpr_type_map {
+/*
+ * The following utility type_traits allow mapping constexpr variable to type at
+ * compile time.
+ * The default return type defined for each map would be returned if queried key
+ * does not exist in the map.
+ */
+
+template <auto keyVal, typename _valueT>
+struct cValTypePair {
+  static constexpr auto key = keyVal;
+  using valueT = _valueT;
+};
+
+template <typename Default, typename FirstMapping, typename ...OtherMapping>
+struct TypeMap {
+  template<auto QueryKey>
+  using query = std::conditional_t<
+    QueryKey == FirstMapping::key,
+    typename FirstMapping::valueT,
+    typename TypeMap<Default, OtherMapping...>::template query<QueryKey>
+  >;
+};
+
+template <typename Default, typename LastMapping>
+struct TypeMap<Default, LastMapping> {
+  template<auto QueryKey>
+  using query = std::conditional_t<QueryKey == LastMapping::key, typename LastMapping::valueT, Default>;
+};
+
+}
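
The map is queried with a constexpr key and yields the associated type, falling back to the Default parameter when no key matches. A minimal standalone sketch with throwaway key/type pairs, assuming the definitions above are in scope:

// Hedged sketch: compile-time key -> type lookup with the TypeMap utility above.
#include <type_traits>

using namespace constexpr_type_map;

// Map element-size-in-bytes to an arbitrary illustrative type.
using WidthMap = TypeMap<void,
    cValTypePair<1, char>,
    cValTypePair<2, short>>;

static_assert(std::is_same_v<WidthMap::query<1>, char>);   // key present
static_assert(std::is_same_v<WidthMap::query<2>, short>);  // key present
static_assert(std::is_same_v<WidthMap::query<4>, void>);   // missing key -> Default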

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp

Lines changed: 17 additions & 3 deletions

@@ -86,7 +86,7 @@ struct Sm100FmhaGenMainloopWarpspecialized {
   using Mask = Mask_;
 
   static constexpr int StageCountQ = get<1>(TileShape{}) == 256 ? 1 : 2;
-  static constexpr int StageCountKV = 256 * 11 / get<1>(TileShape{});
+  static constexpr int StageCountKV = StageCountQ * ((sizeof(Element) == 1) ? /*fp8*/ 12 : /*bf16/fp16*/ 5);
 
   using StagesQ = cutlass::gemm::collective::StageCount<StageCountQ>;
   using StagesKV = cutlass::gemm::collective::StageCount<StageCountKV>;
@@ -540,9 +540,23 @@ struct Sm100FmhaGenMainloopWarpspecialized {
     tStS_P.data() = warp_uniform(uint32_t(stage == _0{} ? TmemAllocation::P0 : TmemAllocation::P1));
     Tensor tScS_P = tScS.compose(make_layout(make_shape(_128{}, tilePlikeFP32)));
 
+    using TMEM_LOAD_OPMAP = constexpr_type_map::TypeMap<void,
+        constexpr_type_map::cValTypePair<1, SM100_TMEM_LOAD_32dp32b8x>,
+        constexpr_type_map::cValTypePair<2, SM100_TMEM_LOAD_32dp32b16x>
+    >;
+    using TMEM_STORE_OPMAP = constexpr_type_map::TypeMap<void,
+        constexpr_type_map::cValTypePair<1, SM100_TMEM_STORE_32dp32b8x>,
+        constexpr_type_map::cValTypePair<2, SM100_TMEM_STORE_32dp32b16x>
+    >;
     // Each thread owns a single row
-    using TMEM_LOAD = conditional_t<size<1>(TileShapeQK{}) < _128{}, SM100_TMEM_LOAD_32dp32b8x, SM100_TMEM_LOAD_32dp32b32x>; // 4x32 threads with 128 cols of 8b elem
-    using TMEM_STORE = conditional_t<size<1>(TileShapeQK{}) < _128{}, SM100_TMEM_STORE_32dp32b8x, SM100_TMEM_STORE_32dp32b32x>; // 4x32 threads with 128 cols of 8b elem
+    using TMEM_LOAD = conditional_t<size<1>(
+        TileShapeQK{}) < _128{},
+        TMEM_LOAD_OPMAP::query<sizeof(Element)>,
+        SM100_TMEM_LOAD_32dp32b32x>; // 4x32 threads with 128 cols of 8b elem
+    using TMEM_STORE = conditional_t<size<1>(
+        TileShapeQK{}) < _128{},
+        TMEM_STORE_OPMAP::query<sizeof(Element)>,
+        SM100_TMEM_STORE_32dp32b32x>; // 4x32 threads with 128 cols of 8b elem
     using TMEM_STORE_V = SM100_TMEM_STORE_32dp32b2x; // 4x32 threads with 2 cols of 32b elem
 
     int thread_idx = threadIdx.x % (4 * cutlass::NumThreadsPerWarp);
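
Two things change for 2-byte elements: the TMEM load/store intrinsics are picked through the new map so the per-row transfer width matches sizeof(Element), and the KV pipeline is shallower because each bf16/fp16 KV stage occupies twice the shared memory of an fp8 stage. A standalone restatement of the revised stage-count rule (the 12/5 budgets and the 256-column special case are taken from the diff above; everything else is illustrative):

// Hedged sketch: the revised stage-count rule from the diff, restated as
// constexpr helpers so the fp8 vs. bf16/fp16 pipeline depths are easy to check.
constexpr int stage_count_q(int tile_n) {
  return tile_n == 256 ? 1 : 2;              // mirrors StageCountQ above
}

constexpr int stage_count_kv(int tile_n, int elem_bytes) {
  // mirrors StageCountKV above: fewer, larger KV stages for 2-byte elements
  return stage_count_q(tile_n) * (elem_bytes == 1 ? /*fp8*/ 12 : /*bf16,fp16*/ 5);
}

static_assert(stage_count_kv(256, 1) == 12);  // fp8, 256-column tile
static_assert(stage_count_kv(256, 2) == 5);   // bf16/fp16, 256-column tile
static_assert(stage_count_kv(128, 2) == 10);  // bf16/fp16, two Q stages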

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/device/fmha.hpp

Lines changed: 2 additions & 0 deletions

@@ -39,6 +39,7 @@
 // common
 #include "cutlass/cutlass.h"
 #include "cutlass/device_kernel.h"
+#include "cutlass/arch/arch.h"
 
 #if !defined(__CUDACC_RTC__)
 #include "cutlass/cluster_launch.hpp"
@@ -57,6 +58,7 @@ template <class Kernel_>
 class FMHA {
  public:
   using Kernel = Kernel_;
+  static_assert(Kernel::SharedStorageSize <= cutlass::arch::sm100_smem_capacity_bytes, "SMEM usage exceeded capacity.");
 
   static int const kThreadCount = Kernel::MaxThreadsPerBlock;
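
The static_assert turns a would-be runtime launch failure into a compile-time error. The same guard works for any wrapper that knows its kernel's SharedStorageSize as a constant; a minimal standalone sketch, where kSmemBudgetBytes is an illustrative placeholder standing in for cutlass::arch::sm100_smem_capacity_bytes:

// Hedged sketch: reject over-budget shared-memory configurations at compile time.
// kSmemBudgetBytes is an illustrative placeholder, not the real SM100 constant.
#include <cstddef>

constexpr std::size_t kSmemBudgetBytes = 228 * 1024;

template <class Kernel>
struct LaunchWrapper {
  static_assert(Kernel::SharedStorageSize <= kSmemBudgetBytes,
                "SMEM usage exceeded capacity.");
};

struct SmallKernel { static constexpr std::size_t SharedStorageSize = 64 * 1024; };
LaunchWrapper<SmallKernel> ok{};  // compiles: within budget
// struct HugeKernel { static constexpr std::size_t SharedStorageSize = 512 * 1024; };
// LaunchWrapper<HugeKernel> bad{};  // fails to compile with the message above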

fbgemm_gpu/experimental/gen_ai/test/attention/blackwell_fmha_test.py

Lines changed: 12 additions & 6 deletions

@@ -439,25 +439,31 @@ def _execute_cutlass_blackwell_attn_varlen(
                 seqlen_k,
                 batch_size,
                 is_mqa,
+                q_heads,
+                dtype,
             )
             for seqlen_k in [64, 128, 256, 1024]
             for batch_size in [1, 2]
             for is_mqa in [True]
-        ]
+            for q_heads in [8]
+            for dtype in [torch.float8_e4m3fn, torch.bfloat16]
+        ],
+        name_func=lambda func, num, p: func.__name__
+        + "_"
+        + num
+        + "_"
+        + "_".join(map(str, p)),
     )
     def test_decode(
         self,
         seqlen_k: int,
         batch_size: int,
        is_mqa: bool,
-        q_heads: int = 8,
-        dtype: torch.dtype = torch.float8_e4m3fn,
+        q_heads: int,
+        dtype: torch.dtype,
     ) -> None:
         seqlen_q = 1
         causal = True
-        assert (
-            dtype == torch.float8_e4m3fn
-        ), "Gen Kernel only supports float8_e4m3fn for now"
         self._execute_cutlass_blackwell_attn_dense(
             batch_size,
             seqlen_q,
