 
 from triton_kernels_benchmark import flash_attention_benchmark
 
+# FIXME: Revert temporary source code modification done in last commit of PR #4399.
+
 
 # pylint: disable=unused-argument
 @triton.jit
 def _attn_fwd_inner(acc, l_i, m_i, q,  #
-                    K_desc, V_desc,  #
-                    start_m, qk_scale,  #
+                    desc_k, desc_v,  #
+                    offset_y, dtype: tl.constexpr, start_m, qk_scale,  #
                     BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr,  #
                     STAGE: tl.constexpr, offs_m: tl.constexpr, offs_n: tl.constexpr,  #
                     N_CTX: tl.constexpr):
@@ -24,13 +26,13 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
     # causal = False
     else:
         lo, hi = 0, N_CTX
-    off_k = lo
-    off_v = lo
+    offsetk_y = offset_y + lo
+    offsetv_y = offset_y + lo
     # loop over k, v and update accumulator
     for start_n in range(lo, hi, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
-        k = K_desc.load([0, off_k])
+        k = desc_k.load([0, offsetk_y])
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
         if STAGE == 2:
@@ -43,18 +45,20 @@ def _attn_fwd_inner(acc, l_i, m_i, q, #
             qk = qk * qk_scale - m_ij[:, None]
         p = tl.math.exp2(qk)
         l_ij = tl.sum(p, 1)
-        # -- update m_i and l_i
+        # -- compute correction factor
         alpha = tl.math.exp2(m_i - m_ij)
         l_i = l_i * alpha + l_ij
         # -- update output accumulator --
         acc = acc * alpha[:, None]
-        # update acc
-        v = V_desc.load([off_v, 0])
+        # prepare p and v for the dot
+        v = desc_v.load([offsetv_y, 0])
+        # note that this non transposed v for FP8 is only supported on Blackwell
         acc += tl.dot(p.to(tl.float16), v)
         # update m_i and l_i
+        # place this at the end of the loop to reduce register pressure
         m_i = m_ij
-        off_v += BLOCK_N
-        off_k += BLOCK_N
+        offsetk_y += BLOCK_N
+        offsetv_y += BLOCK_N
     return acc, l_i, m_i
 
 
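The rescaling above is the standard online-softmax update: whenever the running row max m_i grows, the partial normalizer l_i and the output accumulator acc are scaled down by alpha before the new tile is folded in. A minimal NumPy sketch of that update (not part of the PR; names mirror the kernel, and qk_scale is assumed to already carry the 1.44269504 = log2(e) factor, so exp2 here equals the natural exp of the sm_scale-scaled scores):

import numpy as np

def online_softmax_tile_update(acc, l_i, m_i, qk, v, qk_scale):
    # qk: [BLOCK_M, BLOCK_N] raw scores for this tile, v: [BLOCK_N, D]
    m_ij = np.maximum(m_i, qk.max(axis=1) * qk_scale)  # new running row max
    p = np.exp2(qk * qk_scale - m_ij[:, None])         # unnormalized tile probabilities
    alpha = np.exp2(m_i - m_ij)                        # correction factor for old sums
    l_i = l_i * alpha + p.sum(axis=1)                  # rescale and extend the normalizer
    acc = acc * alpha[:, None] + p @ v                 # rescale and extend the output
    return acc, l_i, m_ij                              # m_ij becomes the next m_i

# Two-tile check against a plain softmax (sm_scale = 0.5 chosen for illustration).
rng = np.random.default_rng(0)
scores = rng.standard_normal((4, 16))
values = rng.standard_normal((16, 8))
qk_scale = 0.5 * 1.44269504
acc, l_i, m_i = np.zeros((4, 8)), np.zeros(4), np.full(4, -np.inf)
for t in range(2):  # two BLOCK_N = 8 tiles
    acc, l_i, m_i = online_softmax_tile_update(acc, l_i, m_i,
                                               scores[:, 8 * t:8 * t + 8],
                                               values[8 * t:8 * t + 8], qk_scale)
p_ref = np.exp(0.5 * scores - (0.5 * scores).max(axis=1, keepdims=True))
ref = (p_ref / p_ref.sum(axis=1, keepdims=True)) @ values
assert np.allclose(acc / l_i[:, None], ref)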
@@ -75,25 +79,28 @@ def _attn_fwd_with_tensor_desc(Q, K, V, sm_scale, M, Out, #
                                BLOCK_N: tl.constexpr,  #
                                STAGE: tl.constexpr  #
                                ):  # pylint: disable=unused-argument
-
+    dtype = tl.float16
+    tl.static_assert(BLOCK_N <= BLOCK_DMODEL)
     start_m = tl.program_id(2)
     off_z = tl.program_id(0)
     off_h = tl.program_id(1)
-    qvk_offset = off_z.to(tl.int64) * stride_qz + off_h.to(tl.int64) * stride_qh
+    offset_y = off_z * (N_CTX * H) + off_h * N_CTX
     if N_CTX <= 512:
         start_m = tl.program_id(0)
         off_z = tl.program_id(2)
-        qvk_offset = off_z.to(tl.int64) * stride_qh
+        offset_y = off_z * N_CTX
+
+    y_dim = Z * H * N_CTX
+    desc_q = tl.make_tensor_descriptor(Q, shape=[y_dim, BLOCK_DMODEL], strides=[BLOCK_DMODEL, 1],
+                                       block_shape=[BLOCK_M, BLOCK_DMODEL])
+    desc_v = tl.make_tensor_descriptor(V, shape=[y_dim, BLOCK_DMODEL], strides=[BLOCK_DMODEL, 1],
+                                       block_shape=[BLOCK_N, BLOCK_DMODEL])
+    desc_k = tl.make_tensor_descriptor(K, shape=[BLOCK_DMODEL, y_dim], strides=[1, BLOCK_DMODEL],
+                                       block_shape=[BLOCK_DMODEL, BLOCK_N])
+    desc_o = tl.make_tensor_descriptor(Out, shape=[y_dim, BLOCK_DMODEL], strides=[BLOCK_DMODEL, 1],
+                                       block_shape=[BLOCK_M, BLOCK_DMODEL])
 
-    # tensor descriptors
-    Q_desc = tl.make_tensor_descriptor(base=Q + qvk_offset, shape=(N_CTX, BLOCK_DMODEL), strides=(stride_qm, stride_qk),
-                                       block_shape=(BLOCK_M, BLOCK_DMODEL))
-    V_desc = tl.make_tensor_descriptor(base=V + qvk_offset, shape=(N_CTX, BLOCK_DMODEL), strides=(stride_vk, stride_vn),
-                                       block_shape=(BLOCK_N, BLOCK_DMODEL))
-    K_desc = tl.make_tensor_descriptor(base=K + qvk_offset, shape=(BLOCK_DMODEL, N_CTX), strides=(stride_kk, stride_kn),
-                                       block_shape=(BLOCK_DMODEL, BLOCK_N))
-    O_desc = tl.make_tensor_descriptor(base=Out + qvk_offset, shape=(N_CTX, BLOCK_DMODEL),
-                                       strides=(stride_om, stride_on), block_shape=(BLOCK_M, BLOCK_DMODEL))
+    qo_offset_y = offset_y + start_m * BLOCK_M
     # initialize offsets
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
     offs_n = tl.arange(0, BLOCK_N)
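For context on this replacement: the old descriptors were rebased per (batch, head) through qvk_offset, while the new ones describe each tensor once as a 2-D [y_dim, BLOCK_DMODEL] view of the whole allocation and let offset_y pick the (off_z, off_h) slice by row. A standalone NumPy check of that indexing, assuming (as strides=[BLOCK_DMODEL, 1] implies) tensors contiguous in (Z, H, N_CTX, BLOCK_DMODEL) layout:

import numpy as np

Z, H, N_CTX, D = 2, 3, 8, 4
q = np.arange(Z * H * N_CTX * D, dtype=np.float32).reshape(Z, H, N_CTX, D)
q2d = q.reshape(Z * H * N_CTX, D)               # the [y_dim, BLOCK_DMODEL] descriptor view
off_z, off_h = 1, 2
offset_y = off_z * (N_CTX * H) + off_h * N_CTX  # as computed in the kernel
assert np.array_equal(q2d[offset_y:offset_y + N_CTX], q[off_z, off_h])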
@@ -105,27 +112,29 @@ def _attn_fwd_with_tensor_desc(Q, K, V, sm_scale, M, Out, #
     qk_scale = sm_scale
     qk_scale *= 1.44269504  # 1/log(2)
     # load q: it will stay in SRAM throughout
-    q = Q_desc.load([start_m * BLOCK_M, 0])
+    q = desc_q.load([qo_offset_y, 0])
     # stage 1: off-band
-    # For causal = True, STAGE = 3, the kernel gets 1 as its STAGE
-    # For causal = False, STAGE = 1, the kernel gets 3 as its STAGE
+    # For causal = True, STAGE = 3 and _attn_fwd_inner gets 1 as its STAGE
+    # For causal = False, STAGE = 1, and _attn_fwd_inner gets 3 as its STAGE
     if STAGE & 1:
-        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_desc, V_desc,  #
-                                        start_m, qk_scale,  #
+        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q,  #
+                                        desc_k, desc_v,  #
+                                        offset_y, dtype, start_m, qk_scale,  #
                                         BLOCK_M, BLOCK_DMODEL, BLOCK_N,  #
                                         4 - STAGE, offs_m, offs_n, N_CTX  #
                                         )
     # stage 2: on-band
     if STAGE & 2:
-        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_desc, V_desc,  #
-                                        start_m, qk_scale,  #
+        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q,  #
+                                        desc_k, desc_v,  #
+                                        offset_y, dtype, start_m, qk_scale,  #
                                         BLOCK_M, BLOCK_DMODEL, BLOCK_N,  #
                                         2, offs_m, offs_n, N_CTX  #
                                         )
     # epilogue
     m_i += tl.math.log2(l_i)
     acc = acc / l_i[:, None]
-    O_desc.store([start_m * BLOCK_M, 0], acc.to(Out.type.element_ty))
+    desc_o.store([qo_offset_y, 0], acc.to(Out.type.element_ty))
 
 
 def get_benchmark(
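A last note on desc_k: it is built over the same memory as K but with shape [BLOCK_DMODEL, y_dim] and strides [1, BLOCK_DMODEL], i.e. a transposed view, so the inner loop can load K^T tiles for tl.dot(q, k) without an explicit transpose. A small NumPy sketch of why swapping the strides yields the transpose, again assuming contiguous K:

import numpy as np

y_dim, D = 16, 8
k = np.arange(y_dim * D, dtype=np.float32).reshape(y_dim, D)  # contiguous [y_dim, D]
kT = np.lib.stride_tricks.as_strided(                         # shape/strides of desc_k
    k, shape=(D, y_dim), strides=(k.itemsize, D * k.itemsize))
assert np.array_equal(kT, k.T)                                # same bytes, free transpose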