Commit 19ca5e6

Make the result consistent with FAv4 benchmark
Differential Revision: D80984604
Pull Request resolved: #356

1 parent: b824703

2 files changed: +32, -13 lines

tritonbench/operators/blackwell_attentions/generate_inputs.py

Lines changed: 11 additions & 7 deletions

@@ -14,7 +14,7 @@ def _generated_qkv_inputs(
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     requires_grad = True
 
-    BATCH, H, N_CTX, N_CTX_KV, D_HEAD = shape
+    BATCH, H, N_HEADS_KV, N_CTX, N_CTX_KV, D_HEAD = shape
 
     q = torch.randn(
         (BATCH, H, N_CTX, D_HEAD),
@@ -23,13 +23,13 @@ def _generated_qkv_inputs(
         requires_grad=requires_grad,
     )
     k = torch.randn(
-        (BATCH, H, N_CTX_KV, D_HEAD),
+        (BATCH, N_HEADS_KV, N_CTX_KV, D_HEAD),
         dtype=dtype,
         device=device,
         requires_grad=requires_grad,
     )
     v = torch.randn(
-        (BATCH, H, N_CTX_KV, D_HEAD),
+        (BATCH, N_HEADS_KV, N_CTX_KV, D_HEAD),
         dtype=dtype,
         device=device,
         requires_grad=requires_grad,
@@ -42,27 +42,31 @@ def _generated_qkv_inputs(
 
 
 def customized_inputs(shape, num_inputs, dtype, device) -> Generator:
-    BATCH, H, SEQ_LEN, SEQ_LEN_KV, D_HEAD = shape
+    BATCH, H, N_HEADS_KV, SEQ_LEN, SEQ_LEN_KV, D_HEAD = shape
 
     SEQ_LEN_LOG2 = 7
 
     if SEQ_LEN is not None:
         SEQ_LEN_KV = SEQ_LEN if SEQ_LEN_KV is None else SEQ_LEN_KV
         if num_inputs is None:
             yield _generated_qkv_inputs(
-                (BATCH, H, SEQ_LEN, SEQ_LEN_KV, D_HEAD), dtype=dtype, device=device
+                (BATCH, H, N_HEADS_KV, SEQ_LEN, SEQ_LEN_KV, D_HEAD),
+                dtype=dtype,
+                device=device,
             )
         else:
             for _i in range(num_inputs):
                 yield _generated_qkv_inputs(
-                    (BATCH, H, SEQ_LEN, SEQ_LEN, D_HEAD), dtype=dtype, device=device
+                    (BATCH, H, N_HEADS_KV, SEQ_LEN, SEQ_LEN, D_HEAD),
+                    dtype=dtype,
+                    device=device,
                 )
                 SEQ_LEN *= 2
             return
     for i in range(SEQ_LEN_LOG2, 15):
         SEQ_LEN = 2**i
         yield _generated_qkv_inputs(
-            (BATCH, H, SEQ_LEN, SEQ_LEN, D_HEAD), dtype=dtype, device=device
+            (BATCH, H, H, SEQ_LEN, SEQ_LEN, D_HEAD), dtype=dtype, device=device
         )
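
Note: the shape tuple consumed by these helpers now carries a separate KV head count, so K and V can be generated with fewer heads than Q (the grouped-query case). Below is a minimal sketch of the resulting tensor layout, assuming the new (BATCH, H, N_HEADS_KV, N_CTX, N_CTX_KV, D_HEAD) convention from the hunks above; the concrete sizes, dtype, and device are illustrative only.

import torch

# Illustrative GQA configuration: 8 query heads sharing 2 KV heads.
BATCH, H, N_HEADS_KV, N_CTX, N_CTX_KV, D_HEAD = 4, 8, 2, 1024, 1024, 64
dtype, device = torch.bfloat16, "cuda"

# Q keeps the full head count; K and V use the (possibly smaller) KV head
# count, mirroring the torch.randn calls in _generated_qkv_inputs above.
q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=device, requires_grad=True)
k = torch.randn((BATCH, N_HEADS_KV, N_CTX_KV, D_HEAD), dtype=dtype, device=device, requires_grad=True)
v = torch.randn((BATCH, N_HEADS_KV, N_CTX_KV, D_HEAD), dtype=dtype, device=device, requires_grad=True)

assert q.shape == (BATCH, H, N_CTX, D_HEAD)
assert k.shape == v.shape == (BATCH, N_HEADS_KV, N_CTX_KV, D_HEAD)

In the sequence-length sweep branch the KV head count is passed as H itself, so the default sweep still produces standard multi-head (non-GQA) inputs.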

tritonbench/operators/blackwell_attentions/operator.py

Lines changed: 21 additions & 6 deletions

@@ -47,7 +47,7 @@
 
 # [Optional] CuTe
 try:
-    import flash_attn.cute.interface as facute
+    from flash_attn.cute.interface import flash_attn_func as facute_flash_attn_func
 
     HAS_FLASH_CUTE = True
 except (ImportError, IOError, AttributeError):
@@ -98,6 +98,9 @@ def parse_op_args(args: List[str]):
         "--seq-len-kv", type=int, default=None, help="Sequence length kv"
     )
     parser.add_argument("--n-heads", type=int, default=48, help="Number of heads")
+    parser.add_argument(
+        "--n-heads-kv", type=int, default=None, help="Number of heads kv"
+    )
     parser.add_argument("--d-head", type=int, default=64, help="specify head dimension")
     parser.add_argument(
         "--causal",
@@ -136,6 +139,9 @@ def __init__(
         self.SEQ_LEN_KV = (
             args.seq_len_kv if args.seq_len_kv is not None else args.seq_len
         )
+        self.N_HEAD_KV = (
+            args.n_heads_kv if args.n_heads_kv is not None else args.n_heads
+        )
         self.H = args.n_heads
         self.D_HEAD = args.d_head
         self.causal = args.causal
@@ -288,7 +294,9 @@ def cutedsl_blackwell(
         q = q.transpose(1, 2).contiguous()
         k = k.transpose(1, 2).contiguous()
         v = v.transpose(1, 2).contiguous()
-        return lambda: facute.flash_attn_func(q, k, v, self.sm_scale, self.causal)
+        return lambda: facute_flash_attn_func(
+            q, k, v, softmax_scale=self.sm_scale, causal=self.causal
+        )
 
     @register_benchmark()
     def flex_attention(self, q, k, v):
@@ -372,7 +380,14 @@ def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
     def get_input_iter(self) -> Generator:
         if self.input_types == "CUSTOMIZED_SHAPES":
             return customized_inputs(
-                shape=(self.BATCH, self.H, self.SEQ_LEN, self.SEQ_LEN_KV, self.D_HEAD),
+                shape=(
+                    self.BATCH,
+                    self.H,
+                    self.N_HEAD_KV,
+                    self.SEQ_LEN,
+                    self.SEQ_LEN_KV,
+                    self.D_HEAD,
+                ),
                 num_inputs=self.tb_args.num_inputs,
                 dtype=self.dtype,
                 device=self.device,
@@ -386,9 +401,9 @@ def get_input_iter(self) -> Generator:
         else:
             raise AssertionError(f"Unknown input type {self.input_types}")
 
-    @register_x_val(label="(Batch, Heads, SeqLen, SeqLen_KV, Dhead)")
+    @register_x_val(label="(Batch, Heads, Heads_KV, SeqLen, SeqLen_KV, Dhead)")
     def get_x_val(self, example_inputs) -> float:
         q, k, v = example_inputs
         B, H, S, D = q.shape
-        _, _, S_KV, _ = k.shape
-        return (B, H, S, S_KV, D)
+        _, H_KV, S_KV, _ = k.shape
+        return (B, H, H_KV, S, S_KV, D)
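
Note: a minimal sketch of how the cutedsl_blackwell benchmark drives the renamed CuTe entry point after this change, assuming the (batch, heads, seq, head_dim) inputs produced by generate_inputs.py. The sizes, dtype, and device are illustrative, and the keyword names simply mirror the hunk above; adjust if your flash_attn build exposes a different signature.

import torch
from flash_attn.cute.interface import flash_attn_func as facute_flash_attn_func

B, H, H_KV, S, S_KV, D = 4, 8, 2, 1024, 1024, 64
sm_scale = 1.0 / (D**0.5)
dtype, device = torch.bfloat16, "cuda"

# Benchmark inputs arrive as (batch, heads, seq, head_dim) ...
q = torch.randn((B, H, S, D), dtype=dtype, device=device)
k = torch.randn((B, H_KV, S_KV, D), dtype=dtype, device=device)
v = torch.randn((B, H_KV, S_KV, D), dtype=dtype, device=device)

# ... and are transposed to (batch, seq, heads, head_dim) before the call,
# as cutedsl_blackwell does in the hunk above.
q, k, v = (t.transpose(1, 2).contiguous() for t in (q, k, v))
out = facute_flash_attn_func(q, k, v, softmax_scale=sm_scale, causal=False)

With the new flag, a run can request fewer KV heads than query heads; when --n-heads-kv is omitted, N_HEAD_KV falls back to --n-heads, so existing configurations keep their previous behavior.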
