import torch

import triton
import triton.language as tl
import triton.language.extra.tlx as tlx
from triton.tools.tensor_descriptor import TensorDescriptor

DEVICE = triton.runtime.driver.active.get_active_torch_device()


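# Autotuner pre-hook: the kernel is launched with TMA tensor descriptors whose
# block shapes start out as placeholders, so this hook patches them to match the
# tile sizes chosen by the selected config before the kernel runs.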
def _host_descriptor_pre_hook(nargs):
    BLOCK_M = nargs["BLOCK_M"]
    BLOCK_N = nargs["BLOCK_N"]
    HEAD_DIM = nargs["HEAD_DIM"]
    if not isinstance(nargs["desc_q"], TensorDescriptor):
        return
    NUM_MMA_GROUPS = nargs["NUM_MMA_GROUPS"]
    BLOCK_M_SPLIT = BLOCK_M // NUM_MMA_GROUPS
    nargs["desc_q"].block_shape = [BLOCK_M_SPLIT, HEAD_DIM]
    if nargs["FP8_OUTPUT"]:
        nargs["desc_v"].block_shape = [HEAD_DIM, BLOCK_N]
    else:
        nargs["desc_v"].block_shape = [BLOCK_N, HEAD_DIM]
    nargs["desc_k"].block_shape = [BLOCK_N, HEAD_DIM]
    nargs["desc_o"].block_shape = [BLOCK_M_SPLIT, HEAD_DIM]


configs = [
    triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'NUM_BUFFERS': 2, 'NUM_MMA_WARPS': 8, 'NUM_MMA_GROUPS': 2},
                  num_stages=0, num_warps=4, pre_hook=_host_descriptor_pre_hook),
]


@triton.autotune(configs=configs, key=["N_CTX", "HEAD_DIM", "FP8_OUTPUT"])
@triton.jit
def _attn_fwd_ws_pipelined_pingpong(sm_scale, M,  #
                                    Z, H, desc_q, desc_k, desc_v, desc_o, N_CTX,  #
                                    HEAD_DIM: tl.constexpr,  #
                                    BLOCK_M: tl.constexpr,  #
                                    BLOCK_N: tl.constexpr,  #
                                    FP8_OUTPUT: tl.constexpr,  #
                                    NUM_BUFFERS: tl.constexpr,  #
                                    NUM_MMA_WARPS: tl.constexpr,  #
                                    NUM_MMA_GROUPS: tl.constexpr,  #
                                    ):
    tl.static_assert(BLOCK_N <= HEAD_DIM)
    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS

    # allocate buffers
    q_tiles = tlx.local_alloc((BLOCK_M_SPLIT, HEAD_DIM), tlx.dtype_of(desc_q), NUM_MMA_GROUPS)
    k_tiles = tlx.local_alloc((BLOCK_N, HEAD_DIM), tlx.dtype_of(desc_k), NUM_BUFFERS)
    v_tiles = tlx.local_alloc((BLOCK_N, HEAD_DIM), tlx.dtype_of(desc_v), NUM_BUFFERS)

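    # K/V double buffers are guarded by full/empty mbarrier pairs: the producer's
    # TMA load signals the "full" barrier once a tile has landed, and every
    # consumer group signals the "empty" barrier (arrive_count=NUM_MMA_GROUPS)
    # when it is done with the tile. Q only needs "full" barriers because it
    # stays resident in shared memory for the whole kernel.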
    # allocate barriers
    q_fulls = tlx.alloc_barriers(num_barriers=NUM_MMA_GROUPS, arrive_count=1)
    k_empties = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=NUM_MMA_GROUPS)
    k_fulls = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=1)
    v_empties = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=NUM_MMA_GROUPS)
    v_fulls = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=1)

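    # Warp specialization: the "default" task acts as the producer and only issues
    # async TMA loads, while NUM_MMA_GROUPS replicated consumer tasks (each with
    # NUM_MMA_WARPS // NUM_MMA_GROUPS warps) own a BLOCK_M_SPLIT-row slice of the
    # query block and run the MMA + softmax pipeline on it.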
    with tlx.async_tasks():
        # producer group
        with tlx.async_task("default"):
            # initialize offsets
            start_m = tl.program_id(0)
            off_hz = tl.program_id(1)
            off_z = off_hz // H
            off_h = off_hz % H
            offset_y = off_z * (N_CTX * H) + off_h * N_CTX
            qo_offset_y = offset_y + start_m * BLOCK_M
            lo, hi = 0, N_CTX
            kv_offset_y = offset_y + lo

            # load q: it will stay in SRAM throughout
            for cid in tl.range(0, NUM_MMA_GROUPS, loop_unroll_factor=NUM_MMA_GROUPS):
                q_full = tlx.local_view(q_fulls, cid)
                tlx.barrier_expect_bytes(q_full, 2 * BLOCK_M_SPLIT * HEAD_DIM)  # float16
                q_tile = tlx.local_view(q_tiles, cid)
                qo_offset_y_split = qo_offset_y + cid * BLOCK_M_SPLIT
                tlx.async_descriptor_load(desc_q, q_tile, [qo_offset_y_split, 0], q_full)

            # loop over loading k, v
            kv_phase = 0
            acc_cnt = 0
            for _ in tl.range(lo, hi, BLOCK_N):
                buf_id = acc_cnt % NUM_BUFFERS
                # buffers in a row share the same phase
                kv_phase = kv_phase ^ (buf_id == 0)

                # wait for the K buffer to be released by the consumer
                k_empty = tlx.local_view(k_empties, buf_id)
                tlx.barrier_wait(k_empty, kv_phase)
                # load K
                k_full = tlx.local_view(k_fulls, buf_id)
                k_tile = tlx.local_view(k_tiles, buf_id)
                tlx.barrier_expect_bytes(k_full, 2 * BLOCK_N * HEAD_DIM)  # float16
                tlx.async_descriptor_load(desc_k, k_tile, [kv_offset_y, 0], k_full)

                # wait for the V buffer to be released by the consumer
                v_empty = tlx.local_view(v_empties, buf_id)
                tlx.barrier_wait(v_empty, kv_phase)
                # load V
                v_full = tlx.local_view(v_fulls, buf_id)
                v_tile = tlx.local_view(v_tiles, buf_id)
                tlx.barrier_expect_bytes(v_full, 2 * BLOCK_N * HEAD_DIM)  # float16
                tlx.async_descriptor_load(desc_v, v_tile, [kv_offset_y, 0], v_full)

                kv_offset_y += BLOCK_N
                acc_cnt += 1

        # consumer group
        with tlx.async_task(num_warps=NUM_MMA_WARPS // NUM_MMA_GROUPS, registers=232, replicate=NUM_MMA_GROUPS):
            # initialize the running softmax statistics m_i (row max) and l_i (row sum)
            m_i = tl.zeros([BLOCK_M_SPLIT], dtype=tl.float32) - float("inf")
            l_i = tl.zeros([BLOCK_M_SPLIT], dtype=tl.float32) + 1.0
            acc = tl.zeros([BLOCK_M_SPLIT, HEAD_DIM], dtype=tl.float32)

            # softmax scale in base-2 units
            qk_scale = sm_scale
            qk_scale *= 1.44269504  # 1/log(2)

            # wait for the Q buffer to be populated by the producer
            cid: tl.constexpr = tlx.async_task_replica_id()
            q_full = tlx.local_view(q_fulls, cid)
            tlx.barrier_wait(q_full, 0)
            q_tile = tlx.local_view(q_tiles, cid)

            lo, hi = 0, N_CTX
            k_phase = 0
            v_phase = 1
            k_buf_id = 0
            v_buf_id = 0

            # wait for the K[0] buffer to be populated by the producer
            k_full = tlx.local_view(k_fulls, k_buf_id)
            tlx.barrier_wait(k_full, k_phase)
            k_tile = tlx.local_view(k_tiles, k_buf_id)

            # -- compute qk[0] ----
            k_tile = tlx.local_trans(k_tile)

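            # Ping-pong scheduling via named barriers 9 and 10 (256 threads =
            # NUM_MMA_WARPS * 32 with the config above): Consumer 0 issues its first
            # QK^T MMA before Consumer 1, which staggers the two consumer groups so
            # one group's MMA can overlap the other's softmax work in the main loop.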
            if cid == 0:
                # Consumer 0 waits for Consumer 1 to reach the synchronization point at barrier 9.
                tlx.named_barrier_wait(9, 256)
            else:
                # Consumer 1 signals its arrival at barrier 9.
                tlx.named_barrier_arrive(9, 256)
                # Then it waits at barrier 10 until Consumer 0 has issued its async_dot.
                tlx.named_barrier_wait(10, 256)

            qk = tlx.async_dot(q_tile, k_tile)

            if cid == 0:
                # After issuing its async_dot, Consumer 0 signals barrier 10 to unblock Consumer 1.
                tlx.named_barrier_arrive(10, 256)

            # wait for the qk MMA to complete
            qk = tlx.async_dot_wait(0, qk)
            # release the K buffer
            k_empty = tlx.local_view(k_empties, k_buf_id)
            tlx.barrier_arrive(k_empty, 1)

            # -- compute m_i and l_i ----
            m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)
            qk = qk * qk_scale - m_ij[:, None]
            p = tl.math.exp2(qk)
            # -- compute correction factor
            alpha = tl.math.exp2(m_i - m_ij)
            # -- update output accumulator[0] --
            acc = acc * alpha[:, None]
            l_ij = tl.sum(p, 1)
            l_i = l_i * alpha + l_ij
            m_i = m_ij
            acc_cnt = 1

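            # Software pipelining: each iteration issues the QK^T MMA for the current
            # K tile and, while it is in flight, the PV MMA that consumes the V tile
            # from the previous iteration, so the two async dots overlap.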
            # loop over k, v and update accumulator
            for _ in tl.range(lo + BLOCK_N, hi, BLOCK_N):
                k_buf_id = acc_cnt % NUM_BUFFERS
                # buffers in a row share the same phase
                k_phase = k_phase ^ (k_buf_id == 0)

                # wait for the K buffer to be populated by the producer
                k_full = tlx.local_view(k_fulls, k_buf_id)
                tlx.barrier_wait(k_full, k_phase)
                k_tile = tlx.local_view(k_tiles, k_buf_id)

                # compute qk for the current iteration
                k_tile = tlx.local_trans(k_tile)
                qk = tlx.async_dot(q_tile, k_tile)

                # compute pv from the previous iteration
                # wait for the previous V buffer to be populated by the producer
                v_buf_id = (acc_cnt - 1) % NUM_BUFFERS
                v_phase = v_phase ^ (v_buf_id == 0)
                v_full = tlx.local_view(v_fulls, v_buf_id)
                tlx.barrier_wait(v_full, v_phase)
                v_tile = tlx.local_view(v_tiles, v_buf_id)
                # prepare p and v for the dot
                p = p.to(tlx.dtype_of(desc_k))
                acc = tlx.async_dot(p, v_tile, acc)

                # wait for the current qk MMA to complete
                qk = tlx.async_dot_wait(1, qk)
                # release the K buffer
                k_empty = tlx.local_view(k_empties, k_buf_id)
                tlx.barrier_arrive(k_empty, 1)

                # -- compute m_i and l_i ----
                m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)
                qk = qk * qk_scale - m_ij[:, None]
                p = tl.math.exp2(qk)
                # -- compute correction factor
                alpha = tl.math.exp2(m_i - m_ij)
                l_ij = tl.sum(p, 1)
                # update m_i and l_i
                l_i = l_i * alpha + l_ij
                m_i = m_ij

                # -- update output accumulator --
                # wait for the previous pv MMA to complete
                acc = tlx.async_dot_wait(0, acc)
                # release the V buffer
                v_empty = tlx.local_view(v_empties, v_buf_id)
                tlx.barrier_arrive(v_empty, 1)
                acc = acc * alpha[:, None]
                acc_cnt += 1

            # compute pv from the last iteration
            # wait for the V buffer to be populated by the producer
            v_buf_id = (acc_cnt - 1) % NUM_BUFFERS
            v_phase = v_phase ^ (v_buf_id == 0)
            v_full = tlx.local_view(v_fulls, v_buf_id)
            tlx.barrier_wait(v_full, v_phase)
            v_tile = tlx.local_view(v_tiles, v_buf_id)
            # prepare p and v for the dot
            p = p.to(tlx.dtype_of(desc_k))
            acc = tlx.async_dot(p, v_tile, acc)
            # wait for the last pv MMA to complete
            acc = tlx.async_dot_wait(0, acc)
            # release the V buffer
            v_empty = tlx.local_view(v_empties, v_buf_id)
            tlx.barrier_arrive(v_empty, 1)

            # epilogue
            start_m = tl.program_id(0)
            off_hz = tl.program_id(1)
            off_z = off_hz // H
            off_h = off_hz % H
            offset_y = off_z * (N_CTX * H) + off_h * N_CTX
            qo_offset_y = offset_y + start_m * BLOCK_M
            qo_offset_y_split = qo_offset_y + cid * BLOCK_M_SPLIT
            m_i += tl.math.log2(l_i)
            acc = acc / l_i[:, None]
            offs_m = start_m * BLOCK_M + cid * BLOCK_M_SPLIT + tl.arange(0, BLOCK_M_SPLIT)
            m_ptrs = M + off_hz * N_CTX + offs_m
            tl.store(m_ptrs, m_i)
            desc_o.store([qo_offset_y_split, 0], acc.to(tlx.dtype_of(desc_o)))


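# Host-side wrapper. The TensorDescriptors below are created with a dummy [1, 1]
# block shape; _host_descriptor_pre_hook patches in the real block shapes once the
# autotuner has picked BLOCK_M, BLOCK_N, and NUM_MMA_GROUPS.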
class _attention(torch.autograd.Function):

    @staticmethod
    def forward(ctx, q, k, v, sm_scale):
        # shape constraints
        HEAD_DIM_Q, HEAD_DIM_K = q.shape[-1], k.shape[-1]
        # when v is in float8_e5m2 it is transposed.
        HEAD_DIM_V = v.shape[-1]
        assert HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V
        assert HEAD_DIM_K in {16, 32, 64, 128, 256}
        o = torch.empty_like(q)
        extra_kern_args = {}

        M = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
        # Note that on Hopper we cannot perform an FP8 dot with a non-transposed second tensor
        y_dim = q.shape[0] * q.shape[1] * q.shape[2]

        dummy_block = [1, 1]
        desc_q = TensorDescriptor(q, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
        if q.dtype == torch.float8_e5m2:
            desc_v = TensorDescriptor(v, shape=[HEAD_DIM_K, y_dim], strides=[q.shape[2], 1], block_shape=dummy_block)
        else:
            desc_v = TensorDescriptor(v, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
        desc_k = TensorDescriptor(k, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)
        desc_o = TensorDescriptor(o, shape=[y_dim, HEAD_DIM_K], strides=[HEAD_DIM_K, 1], block_shape=dummy_block)

        def alloc_fn(size: int, align: int, _):
            return torch.empty(size, dtype=torch.int8, device="cuda")

        triton.set_allocator(alloc_fn)

        def grid(META):
            return (triton.cdiv(q.shape[2], META["BLOCK_M"]), q.shape[0] * q.shape[1], 1)

        ctx.grid = grid
        _attn_fwd_ws_pipelined_pingpong[grid](
            sm_scale, M,  #
            q.shape[0], q.shape[1],  #
            desc_q, desc_k, desc_v, desc_o,  #
            N_CTX=q.shape[2],  #
            HEAD_DIM=HEAD_DIM_K,  #
            FP8_OUTPUT=q.dtype == torch.float8_e5m2,  #
            **extra_kern_args)

        ctx.save_for_backward(q, k, v, o, M)
        ctx.sm_scale = sm_scale
        ctx.HEAD_DIM = HEAD_DIM_K
        return o


attention = _attention.apply
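

# Minimal usage sketch (not part of the original kernel). The shapes, dtype, and
# tolerances below are assumptions: the kernel expects q, k, v laid out as
# contiguous (Z, H, N_CTX, HEAD_DIM) float16 tensors on a Hopper-class GPU, with
# N_CTX a multiple of BLOCK_N.
if __name__ == "__main__":
    Z, H, N_CTX, HEAD_DIM = 1, 2, 1024, 128
    q = torch.randn((Z, H, N_CTX, HEAD_DIM), dtype=torch.float16, device=DEVICE)
    k = torch.randn((Z, H, N_CTX, HEAD_DIM), dtype=torch.float16, device=DEVICE)
    v = torch.randn((Z, H, N_CTX, HEAD_DIM), dtype=torch.float16, device=DEVICE)
    sm_scale = HEAD_DIM ** -0.5
    out = attention(q, k, v, sm_scale)
    # sanity check against the PyTorch reference implementation
    ref = torch.nn.functional.scaled_dot_product_attention(q, k, v, scale=sm_scale)
    torch.testing.assert_close(out, ref, atol=2e-2, rtol=2e-2)
    print("attention output matches the PyTorch reference")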