From 90eda66b952b9f4fe24562861bd4287555576590 Mon Sep 17 00:00:00 2001
From: Neil Dhar
Date: Wed, 8 Oct 2025 22:53:24 -0700
Subject: [PATCH] Enable VECT_MUL for Blackwell attention

Enabling `VECT_MUL` previously caused a regression, but the regression
appears to come from the FMA vectorisation in particular. Enable it
only for multiplication for now, which appears to be a performance win.
---
 tritonbench/kernels/blackwell_triton_fused_attention_dp.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tritonbench/kernels/blackwell_triton_fused_attention_dp.py b/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
index ab13b3dc..3b7ac72d 100644
--- a/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
+++ b/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
@@ -72,7 +72,8 @@ def _attn_fwd_subtile(
         qk -= m_ij[:, None]
     else:
         m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)
-        if VECT_MUL:
+        # TODO: Figure out why vector FMA slows things down.
+        if VECT_MUL and False:
             qk = _fma_f32x2(qk, qk_scale, -m_ij[:, None])
         else:
             qk = qk * qk_scale - m_ij[:, None]
@@ -262,7 +263,7 @@ def make_tile_config(BM, BN, occ, subtile, vectmul, add2reduce):
         for BN in [64, 128]
         for occ in [1, 2]
         for subtile in [True]
-        for vectmul in [False]
+        for vectmul in [True]
         for add2reduce in [False]
     ]
 else:
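
Note for reviewers: both branches of the touched scaling code compute
qk * qk_scale - m_ij[:, None]; the FMA path just fuses the multiply and
add via the _fma_f32x2 helper. Below is a minimal NumPy sketch of that
equivalence, assuming _fma_f32x2(a, b, c) evaluates a * b + c
elementwise (the helper's actual packed-f32x2 codegen is not reproduced
here, and fma_reference is a hypothetical stand-in for it):

    import numpy as np

    def fma_reference(a, b, c):
        # Hypothetical stand-in for _fma_f32x2: a * b + c. A hardware
        # FMA rounds once, while a separate float32 multiply and add
        # round twice, so the two paths may differ in the last bit;
        # allclose below tolerates that.
        return (a * b + c).astype(np.float32)

    qk = np.random.rand(8, 8).astype(np.float32)
    qk_scale = np.float32(0.125)
    m_ij = np.random.rand(8).astype(np.float32)

    unfused = qk * qk_scale - m_ij[:, None]  # path this patch keeps
    fused = fma_reference(qk, qk_scale, -m_ij[:, None])  # disabled FMA path
    assert np.allclose(unfused, fused)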