
Commit a0c6d82

Call _memory_efficient_attention for bwd of cutlass blackwell fmha as well
Differential Revision: D82490887
Pull Request resolved: meta-pytorch#423
1 parent: ae51ff5

File tree: 1 file changed (+5, -4 lines)


tritonbench/operators/blackwell_attentions/operator.py

Lines changed: 5 additions & 4 deletions
@@ -67,6 +67,7 @@
 try:
     import xformers  # @manual=//fair/xformers:xformers
     import xformers.ops.fmha as xformers_fmha  # @manual=//fair/xformers:xformers
+    from xformers.ops.fmha import MemoryEfficientAttentionCutlassBlackwellOp

     from ..flash_attention.test_fmha_utils import permute_qkv

@@ -316,11 +317,11 @@ def cutlass_blackwell(
         k: torch.Tensor,
         v: torch.Tensor,
     ) -> Callable:
-        need_gradient = not (self.mode == BenchmarkMode.FWD_NO_GRAD)
         fhma_input = self.xformers_preprocess(q, k, v)
-        xformers_cutlass_fhma = xformers.ops.fmha.cutlass_blackwell.FwOp
-        return lambda: xformers_cutlass_fhma().apply(
-            fhma_input, needs_gradient=need_gradient
+
+        return lambda: xformers.ops.fmha._memory_efficient_attention(
+            fhma_input,
+            op=MemoryEfficientAttentionCutlassBlackwellOp,
         )

     @register_benchmark(enabled=HAS_XFORMERS, fwd_only=True)
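
Note on the change: the old callable applied cutlass_blackwell.FwOp directly, which only runs the forward kernel, so the BWD benchmark mode could not reach the Blackwell backward kernel. Routing through xformers.ops.fmha._memory_efficient_attention with MemoryEfficientAttentionCutlassBlackwellOp (presumably a forward/backward op pair, following the xformers convention for *Op tuples) lets autograd dispatch the backward op as well. A minimal sketch of how such an op pair is exercised end to end, using the public memory_efficient_attention wrapper instead of the operator's preprocessed Inputs object; the shapes are illustrative only, and it assumes the installed xformers build exports MemoryEfficientAttentionCutlassBlackwellOp and a Blackwell GPU is available:

import torch
import xformers.ops.fmha as fmha
from xformers.ops.fmha import MemoryEfficientAttentionCutlassBlackwellOp

# Illustrative shapes only: (batch, seqlen, heads, head_dim).
q = torch.randn(4, 1024, 8, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

# Forward through the Blackwell op pair; autograd tracks the op so a
# backward kernel is registered for the output.
out = fmha.memory_efficient_attention(
    q, k, v, op=MemoryEfficientAttentionCutlassBlackwellOp
)

# Backward now dispatches the cutlass Blackwell backward kernel, which is
# what this commit makes reachable from the benchmarked callable.
out.sum().backward()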
