
Commit 33a25b4

[FlashAttention] Sync from upstream tensor desc implementation (part 3) (#4520)
No geomean regression (benchmark screenshot: https://github.com/user-attachments/assets/c1f2ac2e-6c47-4238-9086-52dcb337ceae)

Signed-off-by: Whitney Tsang <[email protected]>
1 parent 40ea35f commit 33a25b4

File tree

1 file changed: 2 additions, 3 deletions


benchmarks/triton_kernels_benchmark/flash_attention_tensor_desc_benchmark.py

@@ -33,8 +33,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         k = desc_k.load([0, offsetk_y])
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
+        qk = tl.dot(q, k)
         if STAGE == 2:
             mask = offs_m[:, None] >= (start_n + offs_n[None, :])
             qk = qk * qk_scale + tl.where(mask, 0, -1.0e6)
@@ -47,7 +46,6 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
         # -- compute correction factor
         alpha = tl.math.exp2(m_i - m_ij)
         l_ij = tl.sum(p, 1)
-        l_i = l_i * alpha + l_ij
         # -- update output accumulator --
         acc = acc * alpha[:, None]
         # prepare p and v for the dot
@@ -57,6 +55,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
         acc = tl.dot(p, v, acc)
         # update m_i and l_i
         # place this at the end of the loop to reduce register pressure
+        l_i = l_i * alpha + l_ij
         m_i = m_ij
         offsetk_y += BLOCK_N
         offsetv_y += BLOCK_N
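
For context, below is a minimal sketch of the inner loop as it reads after this commit. Only the lines visible in the diff above are taken from the file; the function signature, the start_n loop bounds, the m_ij/p softmax bookkeeping, and the desc_v load are filled in following the structure of the upstream Triton flash attention tutorial, and should be treated as assumptions rather than the exact benchmark code.

import triton
import triton.language as tl


@triton.jit
def _attn_fwd_inner_sketch(acc, l_i, m_i, q,  #
                           desc_k, desc_v, offsetk_y, offsetv_y,  #
                           qk_scale, lo, hi, offs_m, offs_n,  #
                           STAGE: tl.constexpr, BLOCK_N: tl.constexpr):
    for start_n in range(lo, hi, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        k = desc_k.load([0, offsetk_y])
        # tl.dot accumulates in float32 by default, so the former
        # tl.zeros + `qk += tl.dot(q, k)` pair collapses into one dot.
        qk = tl.dot(q, k)
        if STAGE == 2:
            mask = offs_m[:, None] >= (start_n + offs_n[None, :])
            qk = qk * qk_scale + tl.where(mask, 0, -1.0e6)
            m_ij = tl.maximum(m_i, tl.max(qk, 1))  # assumed, per the tutorial
            qk -= m_ij[:, None]
        else:
            m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)  # assumed
            qk = qk * qk_scale - m_ij[:, None]
        p = tl.math.exp2(qk)  # assumed, per the tutorial
        # -- compute correction factor
        alpha = tl.math.exp2(m_i - m_ij)
        l_ij = tl.sum(p, 1)
        # -- update output accumulator --
        acc = acc * alpha[:, None]
        # prepare p and v for the dot
        v = desc_v.load([offsetv_y, 0])  # assumed descriptor layout
        p = p.to(v.dtype)
        acc = tl.dot(p, v, acc)
        # update m_i and l_i
        # place this at the end of the loop to reduce register pressure
        l_i = l_i * alpha + l_ij
        m_i = m_ij
        offsetk_y += BLOCK_N
        offsetv_y += BLOCK_N
    return acc, l_i, m_i

The first change is safe because tl.dot already accumulates in float32 by default, making the explicit zero-initialized accumulator redundant; the second simply defers the l_i update until after the output-accumulator dot, matching the in-code comment about register pressure.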
