Commit f3ea938

Add head_dim=64 for blackwell cutlass fmha implementation (#1850)
## 📌 Description

This PR adds head_dim=64 support to the Blackwell CUTLASS FMHA kernel and extends the unit tests and benchmarking scripts to cover the new head dimension. The benchmarking script was used to search for the optimal stageCountKV (hypothesized to be 3 for head_dim=64; it turned out to be 1).

## 🧪 Test Results (on B300, CUDA 13.0)

```
pytest tests/attention/test_blackwell_fmha.py
```

```
5616 passed, 720 skipped in 203.34s (0:03:23)
```

```
python benchmarks/bench_blackwell_attention.py
```

```
=== head_dim=128 ===
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=128, causal=False), flops: 1024.563 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=128, causal=False), flops: 1234.186 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=128, causal=False), flops: 1386.312 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=128, causal=False), flops: 1496.488 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=128, causal=False), flops: 1540.769 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=128, causal=False), flops: 1605.068 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=128, causal=False), flops: 1648.648 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=128, causal=False), flops: 1658.047 TFLOPs/s
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=128, causal=True), flops: 440.781 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=128, causal=True), flops: 638.431 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=128, causal=True), flops: 963.078 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=128, causal=True), flops: 1223.670 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=128, causal=True), flops: 1379.715 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=128, causal=True), flops: 1497.805 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=128, causal=True), flops: 1584.493 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=128, causal=True), flops: 1638.206 TFLOPs/s

=== head_dim=64 ===
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=64, causal=False), flops: 449.641 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=64, causal=False), flops: 520.870 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=64, causal=False), flops: 596.860 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=64, causal=False), flops: 654.122 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=64, causal=False), flops: 673.011 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=64, causal=False), flops: 791.186 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=64, causal=False), flops: 872.266 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=64, causal=False), flops: 870.826 TFLOPs/s
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=64, causal=True), flops: 262.144 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=64, causal=True), flops: 375.960 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=64, causal=True), flops: 477.245 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=64, causal=True), flops: 544.132 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=64, causal=True), flops: 644.116 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=64, causal=True), flops: 721.476 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=64, causal=True), flops: 723.058 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=64, causal=True), flops: 758.397 TFLOPs/s
```

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
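The TFLOPs/s figures above can be sanity-checked with the standard FMHA FLOP count: two GEMMs per head (Q·Kᵀ and P·V) of 2·qkv_len²·head_dim FLOPs each, with causal masking removing roughly half of the work. A minimal sketch of that accounting (the actual `flops` helper in `bench_blackwell_attention.py` may differ in detail):

```python
def attention_tflops_per_s(ms: float, batch_size: int, qkv_len: int,
                           num_heads: int, head_dim: int, causal: bool) -> float:
    # Two GEMMs per head (Q @ K^T and P @ V), 2 * qkv_len^2 * head_dim FLOPs each.
    total = 4.0 * batch_size * num_heads * qkv_len * qkv_len * head_dim
    if causal:
        total /= 2  # the causal mask removes roughly half of the score matrix
    return total / ms / 1e9  # FLOPs / (ms * 1e-3 s) / 1e12 == FLOPs / ms / 1e9

# e.g. batch_size=128, qkv_len=512, head_dim=64, causal=True at ~0.524 ms
# gives ~262 TFLOPs/s, matching the first causal head_dim=64 row above.
```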
1 parent ec4fc2c commit f3ea938

File tree

5 files changed (+33, -12 lines)


benchmarks/bench_blackwell_attention.py

Lines changed: 20 additions & 0 deletions

```diff
@@ -81,6 +81,7 @@ def flops(ms):
 
 
 if __name__ == "__main__":
+    print("\n === head_dim=128 ===")
     bench_fmha_blackwell(128, 512, 32, 128, False, torch.bfloat16)
     bench_fmha_blackwell(64, 1024, 32, 128, False, torch.bfloat16)
     bench_fmha_blackwell(32, 2048, 32, 128, False, torch.bfloat16)
@@ -98,3 +99,22 @@ def flops(ms):
     bench_fmha_blackwell(4, 16384, 32, 128, True, torch.bfloat16)
     bench_fmha_blackwell(2, 32768, 32, 128, True, torch.bfloat16)
     bench_fmha_blackwell(1, 65536, 32, 128, True, torch.bfloat16)
+
+    print("\n === head_dim=64 ===")
+    bench_fmha_blackwell(128, 512, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(64, 1024, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(32, 2048, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(16, 4096, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(8, 8192, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(4, 16384, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(2, 32768, 32, 64, False, torch.bfloat16)
+    bench_fmha_blackwell(1, 65536, 32, 64, False, torch.bfloat16)
+
+    bench_fmha_blackwell(128, 512, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(64, 1024, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(32, 2048, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(16, 4096, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(8, 8192, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(4, 16384, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(2, 32768, 32, 64, True, torch.bfloat16)
+    bench_fmha_blackwell(1, 65536, 32, 64, True, torch.bfloat16)
```
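The positional arguments match the labels in the benchmark output: batch_size, qkv_len, num_heads, head_dim, causal, dtype. The head_dim=64 sweep added above could equivalently be written as a loop; a sketch, assuming the benchmark's own imports (the `bench_blackwell_attention` import path is hypothetical):

```python
import torch
from bench_blackwell_attention import bench_fmha_blackwell  # hypothetical import path

# Loop form of the head_dim=64 sweep, total tokens held at 65536 per config.
for causal in (False, True):
    for batch_size, qkv_len in [(128, 512), (64, 1024), (32, 2048), (16, 4096),
                                (8, 8192), (4, 16384), (2, 32768), (1, 65536)]:
        bench_fmha_blackwell(batch_size, qkv_len, 32, 64, causal, torch.bfloat16)
```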

csrc/fmha_cutlass_sm100.cu

Lines changed: 5 additions & 1 deletion

```diff
@@ -43,6 +43,10 @@ using tvm::ffi::Optional;
     constexpr int HEAD_DIM_QK = 128;                     \
     constexpr int HEAD_DIM_VO = 128;                     \
     return __VA_ARGS__();                                \
+  } else if (head_dim_qk == 64 && head_dim_vo == 64) {   \
+    constexpr int HEAD_DIM_QK = 64;                      \
+    constexpr int HEAD_DIM_VO = 64;                      \
+    return __VA_ARGS__();                                \
   }                                                      \
   return false;                                          \
 }()
@@ -98,7 +102,7 @@ void FMHACutlassSM100Run(ffi::Tensor workspace_buffer, ffi::Tensor q, ffi::Tenso
   using cutlass_type_in = cutlass_dtype_t<DTypeIn>;
   using cutlass_type_out = cutlass_dtype_t<DTypeOut>;
   using TILE_Q = _256;
-  using TILE_KV = _128;
+  using TILE_KV = std::conditional_t<HEAD_DIM_QK == 64, _64, _128>;
   using D_QK = cute::Int<HEAD_DIM_QK>;
   using D_VO = cute::Int<HEAD_DIM_VO>;
   using TileShapeQK = Shape<TILE_Q, TILE_KV, D_QK>;
```
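The dispatch macro turns the runtime head dims into compile-time constants, and the `std::conditional_t` then halves TILE_KV for the narrower head. A runtime Python mirror of that tile choice, for illustration only (the real logic is the compile-time C++ above; other head-dim branches, e.g. the 192/128 pair exercised by the tests, are elided here):

```python
def select_tile_shape_qk(head_dim_qk: int, head_dim_vo: int) -> tuple[int, int, int]:
    # Mirrors the compile-time choice: TILE_Q stays 256 while TILE_KV
    # drops from 128 to 64 when head_dim_qk == 64.
    if (head_dim_qk, head_dim_vo) not in {(128, 128), (64, 64)}:
        raise ValueError("head-dim pair not shown in this diff")
    TILE_Q = 256
    TILE_KV = 64 if head_dim_qk == 64 else 128
    return (TILE_Q, TILE_KV, head_dim_qk)  # TileShapeQK = Shape<TILE_Q, TILE_KV, D_QK>

assert select_tile_shape_qk(64, 64) == (256, 64, 64)
```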

include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -857,8 +857,8 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
 
   float2 scale_f32x2 = make_float2(scale, scale);
 
-  Tensor tTMrO =
-      make_tensor<ElementPV>(make_shape(shape(tTMEM_LOADcO), Int<128 / kCorrectionTileSize>{}));
+  Tensor tTMrO = make_tensor<ElementPV>(
+      make_shape(shape(tTMEM_LOADcO), Int<get<1>(TileShapePV{}) / kCorrectionTileSize>{}));
 
   auto copy_in = [&](int i) {
     Tensor tTMEM_LOADtO_i = tTMEM_LOADtO;
```

include/flashinfer/attention/blackwell/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp

Lines changed: 3 additions & 3 deletions

```diff
@@ -785,7 +785,7 @@ struct Sm100FmhaGenMainloopWarpspecialized {
   // loop:
   //   TMEM_LOAD, TMEM_LOAD, FMUL2, FFMA2, STG
   CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < 128 / kCorrectionTileSize; i++) {
+  for (int i = 0; i < get<2>(TileShape{}) / kCorrectionTileSize; i++) {
     Tensor tTMEM_LOADtO0_i = tTMEM_LOADtO0;
     tTMEM_LOADtO0_i.data() = tTMEM_LOADtO0_i.data().get() + uint32_t(i * kCorrectionTileSize);
     Tensor tTMEM_LOADtO1_i = tTMEM_LOADtO1;
@@ -867,8 +867,8 @@ struct Sm100FmhaGenMainloopWarpspecialized {
 
   float2 scale_f32x2 = make_float2(scale, scale);
 
-  Tensor tTMrO =
-      make_tensor<ElementPV>(make_shape(shape(tTMEM_LOADcO), Int<128 / kCorrectionTileSize>{}));
+  Tensor tTMrO = make_tensor<ElementPV>(
+      make_shape(shape(tTMEM_LOADcO), Int<get<2>(TileShape{}) / kCorrectionTileSize>{}));
 
   auto copy_in = [&](int i) {
     Tensor tTMEM_LOADtO_i = tTMEM_LOADtO;
```
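Both mainloops previously hard-coded a 128-wide O tile when sizing the correction loop and the per-thread accumulator; reading the extent from the tile shape lets the same code serve head_dim=64. A Python mirror of the iteration count, with a hypothetical kCorrectionTileSize value:

```python
KCORRECTION_TILE_SIZE = 32  # hypothetical value, for illustration only

def correction_iters(tile_o_extent: int) -> int:
    # Before this change: 128 // KCORRECTION_TILE_SIZE regardless of head_dim.
    # After: the extent is read from TileShape / TileShapePV, so head_dim_vo=64
    # runs half as many TMEM_LOAD / FMUL2 / FFMA2 / STG iterations.
    return tile_o_extent // KCORRECTION_TILE_SIZE

assert correction_iters(128) == 4
assert correction_iters(64) == 2
```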

tests/attention/test_blackwell_fmha.py

Lines changed: 3 additions & 6 deletions

```diff
@@ -90,8 +90,7 @@ def attention_varlen_ref(
 @pytest.mark.parametrize("kv_len", [1, 17, 544, 977, 1999])
 @pytest.mark.parametrize("num_qo_heads", [32])
 @pytest.mark.parametrize("num_kv_heads", [8, 32])
-@pytest.mark.parametrize("head_dim_qk", [192, 128])
-@pytest.mark.parametrize("head_dim_vo", [128])
+@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
 @pytest.mark.parametrize("sm_scale", [1.0, 1.0 / math.sqrt(192), 1.0 / math.sqrt(128)])
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
@@ -169,8 +168,7 @@ def test_blackwell_cutlass_fmha(
 @pytest.mark.parametrize("indptr", VARLEN_INDPTR_PARAMS)
 @pytest.mark.parametrize("num_qo_heads", [32])
 @pytest.mark.parametrize("num_kv_heads", [8, 32])
-@pytest.mark.parametrize("head_dim_qk", [192, 128])
-@pytest.mark.parametrize("head_dim_vo", [128])
+@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
 @pytest.mark.parametrize("sm_scale", [1.0 / math.sqrt(128)])
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
@@ -251,8 +249,7 @@ def test_blackwell_cutlass_varlen(
 @pytest.mark.parametrize("kv_indptr_list", [[0, 50, 50, 50, 50, 50, 50, 50]])
 @pytest.mark.parametrize("num_qo_heads", [32])
 @pytest.mark.parametrize("num_kv_heads", [8, 32])
-@pytest.mark.parametrize("head_dim_qk", [192, 128])
-@pytest.mark.parametrize("head_dim_vo", [128])
+@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
 @pytest.mark.parametrize("sm_scale", [1.0 / math.sqrt(128)])
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
 def test_blackwell_cutlass_qo_kv_varlen(
```
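Fusing the two head-dim axes into one parametrize keeps the grid to supported pairs; two stacked decorators would cross-product into unsupported combinations such as (64, 128). A self-contained sketch of the pattern:

```python
import pytest

# Paired parametrization (as in the diff above) enumerates only supported
# (head_dim_qk, head_dim_vo) pairs, rather than the full cross product.
@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
def test_head_dim_pair_is_supported(head_dim_qk, head_dim_vo):
    assert (head_dim_qk, head_dim_vo) in {(192, 128), (128, 128), (64, 64)}
```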
