From 90eda66b952b9f4fe24562861bd4287555576590 Mon Sep 17 00:00:00 2001
From: Neil Dhar
Date: Wed, 8 Oct 2025 22:53:24 -0700
Subject: [PATCH] Enable VECT_MUL for Blackwell attention

Enabling `VECT_MUL` previously caused a regression, but the regression
appears to come from the FMA vectorisation in particular. Enable it
only for multiplication for now, which appears to be a performance win.
---
 tritonbench/kernels/blackwell_triton_fused_attention_dp.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tritonbench/kernels/blackwell_triton_fused_attention_dp.py b/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
index ab13b3dc..3b7ac72d 100644
--- a/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
+++ b/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
@@ -72,7 +72,8 @@ def _attn_fwd_subtile(
         qk -= m_ij[:, None]
     else:
         m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)
-        if VECT_MUL:
+        # TODO: Figure out why vector FMA slows things down.
+        if VECT_MUL and False:
             qk = _fma_f32x2(qk, qk_scale, -m_ij[:, None])
         else:
             qk = qk * qk_scale - m_ij[:, None]
@@ -262,7 +263,7 @@ def make_tile_config(BM, BN, occ, subtile, vectmul, add2reduce):
         for BN in [64, 128]
         for occ in [1, 2]
         for subtile in [True]
-        for vectmul in [False]
+        for vectmul in [True]
         for add2reduce in [False]
     ]
 else:
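
Note for reviewers: both branches of the touched scaling code compute
qk * qk_scale - m_ij[:, None]; the FMA path just fuses the multiply and
add via the _fma_f32x2 helper. Below is a minimal NumPy sketch of that
equivalence, assuming _fma_f32x2(a, b, c) evaluates a * b + c
elementwise (the helper's actual packed-f32x2 codegen is not reproduced
here, and fma_reference is a hypothetical stand-in for it):

    import numpy as np

    def fma_reference(a, b, c):
        # Hypothetical stand-in for _fma_f32x2: a * b + c. A hardware
        # FMA rounds once, while a separate float32 multiply and add
        # round twice, so the two paths may differ in the last bit;
        # allclose below tolerates that.
        return (a * b + c).astype(np.float32)

    qk = np.random.rand(8, 8).astype(np.float32)
    qk_scale = np.float32(0.125)
    m_ij = np.random.rand(8).astype(np.float32)

    unfused = qk * qk_scale - m_ij[:, None]  # path this patch keeps
    fused = fma_reference(qk, qk_scale, -m_ij[:, None])  # disabled FMA path
    assert np.allclose(unfused, fused)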