[FlashAttention] Sync from upstream tensor desc implementation (part 2) (#4470)

whitneywhtsang · web-flow · commit 1cdfaf0785f8 · 2025-06-16T20:51:42.000-04:00
Signed-off-by: Whitney Tsang &lt;whitney.tsang@intel.com&gt;
diff --git a/benchmarks/triton_kernels_benchmark/flash_attention_tensor_desc_benchmark.py b/benchmarks/triton_kernels_benchmark/flash_attention_tensor_desc_benchmark.py
@@ -29,7 +29,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q,  #
     offsetk_y = offset_y + lo
     offsetv_y = offset_y + lo
     # loop over k, v and update accumulator
-    for start_n in range(lo, hi, BLOCK_N):
+    for start_n in tl.range(lo, hi, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         k = desc_k.load([0, offsetk_y])
@@ -44,16 +44,17 @@ def _attn_fwd_inner(acc, l_i, m_i, q,  #
             m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)
             qk = qk * qk_scale - m_ij[:, None]
         p = tl.math.exp2(qk)
-        l_ij = tl.sum(p, 1)
         # -- compute correction factor
         alpha = tl.math.exp2(m_i - m_ij)
+        l_ij = tl.sum(p, 1)
         l_i = l_i * alpha + l_ij
         # -- update output accumulator --
         acc = acc * alpha[:, None]
         # prepare p and v for the dot
         v = desc_v.load([offsetv_y, 0])
+        p = p.to(dtype)
         # note that this non transposed v for FP8 is only supported on Blackwell
-        acc += tl.dot(p.to(tl.float16), v)
+        acc = tl.dot(p, v, acc)
         # update m_i and l_i
         # place this at the end of the loop to reduce register pressure
         m_i = m_ij