
Commit 792dcb1

Tune kernel compilation parameters for #1850 (#1878)
## 📌 Description

A follow-up to #1850 that tunes the pipeline stage / tile size values for a performance improvement (benchmarking results below). It also adjusts the test parametrization so that `sm_scale` is paired with realistic head-dim combinations.

## 🧪 Test results

### Unit testing results (tests/attention/test_blackwell_fmha.py)

```
================= 3096 passed, 240 skipped in 189.17s (0:03:09) =================
```

### Benchmarking results (benchmarks/bench_blackwell_attention.py)

(before)

```
=== head_dim=64 ===
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=64, causal=False), flops: 448.139 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=64, causal=False), flops: 520.066 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=64, causal=False), flops: 595.861 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=64, causal=False), flops: 653.053 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=64, causal=False), flops: 671.899 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=64, causal=False), flops: 788.719 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=64, causal=False), flops: 869.262 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=64, causal=False), flops: 868.034 TFLOPs/s
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=64, causal=True), flops: 261.792 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=64, causal=True), flops: 374.697 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=64, causal=True), flops: 476.372 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=64, causal=True), flops: 543.667 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=64, causal=True), flops: 642.878 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=64, causal=True), flops: 720.390 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=64, causal=True), flops: 721.056 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=64, causal=True), flops: 756.090 TFLOPs/s
```

(after)

```
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=64, causal=False), flops: 695.429 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=64, causal=False), flops: 876.748 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=64, causal=False), flops: 985.989 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=64, causal=False), flops: 1049.088 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=64, causal=False), flops: 1093.423 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=64, causal=False), flops: 1119.016 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=64, causal=False), flops: 1138.080 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=64, causal=False), flops: 1151.325 TFLOPs/s
bench_fmha_blackwell (batch_size=128, qkv_len=512, num_heads=32, head_dim=64, causal=True), flops: 273.278 TFLOPs/s
bench_fmha_blackwell (batch_size=64, qkv_len=1024, num_heads=32, head_dim=64, causal=True), flops: 416.845 TFLOPs/s
bench_fmha_blackwell (batch_size=32, qkv_len=2048, num_heads=32, head_dim=64, causal=True), flops: 616.595 TFLOPs/s
bench_fmha_blackwell (batch_size=16, qkv_len=4096, num_heads=32, head_dim=64, causal=True), flops: 810.543 TFLOPs/s
bench_fmha_blackwell (batch_size=8, qkv_len=8192, num_heads=32, head_dim=64, causal=True), flops: 940.429 TFLOPs/s
bench_fmha_blackwell (batch_size=4, qkv_len=16384, num_heads=32, head_dim=64, causal=True), flops: 1028.673 TFLOPs/s
bench_fmha_blackwell (batch_size=2, qkv_len=32768, num_heads=32, head_dim=64, causal=True), flops: 1083.968 TFLOPs/s
bench_fmha_blackwell (batch_size=1, qkv_len=65536, num_heads=32, head_dim=64, causal=True), flops: 1131.110 TFLOPs/s
```

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
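
As a quick sanity check on the numbers above, a small sketch (not part of the commit) that computes the speedup implied by two of the quoted head_dim=64, causal=False points:

```python
# Speedup implied by the before/after benchmarks quoted above
# (head_dim=64, causal=False; values in TFLOPs/s, copied verbatim).
before = {(128, 512): 448.139, (1, 65536): 868.034}
after = {(128, 512): 695.429, (1, 65536): 1151.325}

for (batch_size, qkv_len), tflops in before.items():
    print(f"batch_size={batch_size}, qkv_len={qkv_len}: "
          f"{after[(batch_size, qkv_len)] / tflops:.2f}x")
# batch_size=128, qkv_len=512: 1.55x
# batch_size=1, qkv_len=65536: 1.33x
```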
1 parent 74ee9d0 commit 792dcb1

File tree

3 files changed: +28 −9 lines changed


csrc/fmha_cutlass_sm100.cu

Lines changed: 1 addition & 1 deletion

```diff
@@ -102,7 +102,7 @@ void FMHACutlassSM100Run(ffi::Tensor workspace_buffer, ffi::Tensor q, ffi::Tenso
   using cutlass_type_in = cutlass_dtype_t<DTypeIn>;
   using cutlass_type_out = cutlass_dtype_t<DTypeOut>;
   using TILE_Q = _256;
-  using TILE_KV = std::conditional_t<HEAD_DIM_QK == 64, _64, _128>;
+  using TILE_KV = _128;
   using D_QK = cute::Int<HEAD_DIM_QK>;
   using D_VO = cute::Int<HEAD_DIM_VO>;
   using TileShapeQK = Shape<TILE_Q, TILE_KV, D_QK>;
```
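
For orientation, `TileShapeQK = Shape<TILE_Q, TILE_KV, D_QK>` packs (Q tile, KV tile, QK head dim). A minimal Python sketch (illustrative only; the real types are cute shapes in C++) of the tile shapes instantiated after this change, for the head dims the tests exercise:

```python
# Illustrative tuples mirroring the C++ Shape<TILE_Q, TILE_KV, D_QK> type.
TILE_Q, TILE_KV = 256, 128  # TILE_KV was 64 for head_dim 64 before this change

for head_dim_qk in (64, 128, 192):  # head dims exercised by the tests
    tile_shape_qk = (TILE_Q, TILE_KV, head_dim_qk)
    print(tile_shape_qk)  # e.g. (256, 128, 64); previously (256, 64, 64)
```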
include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp

Lines changed: 3 additions & 2 deletions

```diff
@@ -64,8 +64,9 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
   using Mask = Mask_;

   static constexpr int StageCountQ = 2;
-  static constexpr int StageCountKV =
-      get<2>(TileShapeQK{}) == 128 ? 2 : 1;  // sizeof(Element_) == 1 ? 2 : 2;
+  static constexpr int StageCountKV = (get<2>(TileShapeQK{}) == 128 || get<2>(TileShapeQK{}) == 64)
+                                          ? 2
+                                          : 1;  // sizeof(Element_) == 1 ? 2 : 2;

   using StagesQ = cutlass::gemm::collective::StageCount<StageCountQ>;
   using StagesKV = cutlass::gemm::collective::StageCount<StageCountKV>;
```
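
Since `get<2>(TileShapeQK{})` is `D_QK` (the QK head dimension, per the tile shape defined in `csrc/fmha_cutlass_sm100.cu` above), the net effect is that two KV pipeline stages are now selected for head_dim 64 as well as 128. A Python restatement of the constexpr logic (illustrative, not the source):

```python
def stage_count_kv(head_dim_qk: int) -> int:
    """Mirror of the C++ constexpr StageCountKV selection above."""
    # Two KV pipeline stages for head_dim 64 or 128, one otherwise.
    return 2 if head_dim_qk in (64, 128) else 1

assert stage_count_kv(64) == 2   # was 1 before this change
assert stage_count_kv(128) == 2  # unchanged
assert stage_count_kv(192) == 1  # unchanged
```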

tests/attention/test_blackwell_fmha.py

Lines changed: 24 additions & 6 deletions

```diff
@@ -90,8 +90,14 @@ def attention_varlen_ref(
 @pytest.mark.parametrize("kv_len", [1, 17, 544, 977, 1999])
 @pytest.mark.parametrize("num_qo_heads", [32])
 @pytest.mark.parametrize("num_kv_heads", [8, 32])
-@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
-@pytest.mark.parametrize("sm_scale", [1.0, 1.0 / math.sqrt(192), 1.0 / math.sqrt(128)])
+@pytest.mark.parametrize(
+    "head_dim_qk,head_dim_vo,sm_scale",
+    [
+        (192, 128, 1.0 / math.sqrt(192)),
+        (128, 128, 1.0 / math.sqrt(128)),
+        (64, 64, 1.0 / math.sqrt(64)),
+    ],
+)
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 def test_blackwell_cutlass_fmha(
@@ -168,8 +174,14 @@ def test_blackwell_cutlass_fmha(
 @pytest.mark.parametrize("indptr", VARLEN_INDPTR_PARAMS)
 @pytest.mark.parametrize("num_qo_heads", [32])
 @pytest.mark.parametrize("num_kv_heads", [8, 32])
-@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
-@pytest.mark.parametrize("sm_scale", [1.0 / math.sqrt(128)])
+@pytest.mark.parametrize(
+    "head_dim_qk,head_dim_vo,sm_scale",
+    [
+        (192, 128, 1.0 / math.sqrt(192)),
+        (128, 128, 1.0 / math.sqrt(128)),
+        (64, 64, 1.0 / math.sqrt(64)),
+    ],
+)
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 def test_blackwell_cutlass_varlen(
@@ -249,8 +261,14 @@ def test_blackwell_cutlass_varlen(
 @pytest.mark.parametrize("kv_indptr_list", [[0, 50, 50, 50, 50, 50, 50, 50]])
 @pytest.mark.parametrize("num_qo_heads", [32])
 @pytest.mark.parametrize("num_kv_heads", [8, 32])
-@pytest.mark.parametrize("head_dim_qk,head_dim_vo", [(192, 128), (128, 128), (64, 64)])
-@pytest.mark.parametrize("sm_scale", [1.0 / math.sqrt(128)])
+@pytest.mark.parametrize(
+    "head_dim_qk,head_dim_vo,sm_scale",
+    [
+        (192, 128, 1.0 / math.sqrt(192)),
+        (128, 128, 1.0 / math.sqrt(128)),
+        (64, 64, 1.0 / math.sqrt(64)),
+    ],
+)
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
 def test_blackwell_cutlass_qo_kv_varlen(
     qo_indptr_list,
```
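
The three parametrizations above repeat the same coupled tuples. As a sketch of the pattern (a hypothetical refactor, not part of the commit), the tuples could be generated from the head-dim pairs so that `sm_scale = 1/sqrt(head_dim_qk)` stays in sync:

```python
import math

# Hypothetical helper mirroring the explicit lists in the tests above.
HEAD_DIM_PAIRS = [(192, 128), (128, 128), (64, 64)]
FMHA_HEAD_DIM_PARAMS = [
    (d_qk, d_vo, 1.0 / math.sqrt(d_qk)) for d_qk, d_vo in HEAD_DIM_PAIRS
]
# Usage: @pytest.mark.parametrize("head_dim_qk,head_dim_vo,sm_scale", FMHA_HEAD_DIM_PARAMS)
```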
