Skip to content

Commit 29b4513

Browse files
njriasan and meta-codesync[bot]
authored and committed
[TLX] Provide a basic implementation of FA causal backwards (#713)
Summary: Adds a basic implementation for backwards. It needs significant optimization.
Pull Request resolved: #713
Reviewed By: srivatsan-ramesh, htyu
Differential Revision: D88263408
Pulled By: njriasan
fbshipit-source-id: 104895a1e944bdf45404e92fca7f22f03227068c
1 parent 9bf80fe commit 29b4513

File tree

1 file changed

+176
-49
lines changed

1 file changed

+176
-49
lines changed

third_party/tlx/tutorials/blackwell-fa-ws-pipelined-persistent_test.py

Lines changed: 176 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -166,6 +166,20 @@ def _get_unfused_loop_bounds(start_m, N_CTX, BLOCK_M, STAGE: tl.constexpr):
166166
return lo, hi
167167

168168

169+
@triton.jit
def _get_unfused_bwd_loop_bounds(N_CTX, BLOCK_M, STAGE: tl.constexpr):
    """Return the half-open row range [lo, hi) for the backward inner loop.

    Unlike the forward-pass `_get_unfused_loop_bounds`, these bounds do not
    depend on `start_m`: the backward pass walks query rows for a K/V tile
    starting from row 0.

    Args:
        N_CTX: Total sequence length.
        BLOCK_M: Row tile size. Currently unused here; presumably kept for
            signature parity with `_get_unfused_loop_bounds` — TODO confirm.
        STAGE: compile-time stage selector. 1 and 2 are the two halves of
            the causal (STAGE == 3) split; any other value must be 3.

    Returns:
        (lo, hi): loop bounds; callers step through them in BLOCK_M chunks.
    """
    if STAGE == 1:
        # First part of STAGE == 3
        lo, hi = 0, N_CTX
    elif STAGE == 2:
        # Second part of STAGE == 3 in this function. Intentionally an
        # empty range (lo == hi == N_CTX), so callers take zero steps;
        # the split point is not yet computed (see the STAGE & 2 TODO in
        # the caller).
        lo, hi = N_CTX, N_CTX
    else:
        # Only stages 1-3 are valid; fail at compile time otherwise.
        tl.static_assert(STAGE == 3)
        lo, hi = 0, N_CTX
    return lo, hi
181+
182+
169183
@triton.jit
170184
def _get_fused_loop_bounds(start_m, N_CTX, BLOCK_M, STAGE: tl.constexpr):
171185
if STAGE == 1:
@@ -867,7 +881,7 @@ def _attn_fwd_ws(sm_scale, M, #
867881
@triton.jit
868882
def _attn_bwd_preprocess(O, DO, #
869883
Delta, #
870-
Z, H, N_CTX, #
884+
N_CTX, #
871885
BLOCK_M: tl.constexpr, HEAD_DIM: tl.constexpr, #
872886
):
873887
off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
@@ -1107,6 +1121,77 @@ def _bwd_host_descriptor_pre_hook_tlx(nargs):
11071121
]
11081122

11091123

1124+
# TODO: Unused. Fix layout issue inside TLX.
@triton.jit
def _bwd_compute_inner_loop(
    start_n,
    qk_fulls,
    qk_tiles,
    qk_empties,
    p_tiles,
    p_fulls,
    dp_fulls,
    dp_tiles,
    ds_tiles,
    ds_fulls,
    M,
    D,
    curr_m,
    blk_idx,
    step_m,
    do_out_dtype,
    q_out_dtype,
    N_CTX,
    NUM_BUFFERS_TMEM: tl.constexpr,
    NUM_BUFFERS_DS: tl.constexpr,
    BLOCK_M1: tl.constexpr,
    BLOCK_N1: tl.constexpr,
    STAGE: tl.constexpr,
):
    """Compute worker inner loop of the warp-specialized FA backward pass.

    For one K/V tile (rows `start_n .. start_n + BLOCK_N1`), iterates over
    query-row blocks of size BLOCK_M1, consuming qkT and dpT tiles produced
    by the MMA task and producing pT and dsT tiles for it, with TLX barriers
    sequencing the handoff.

    NOTE(review): this mirrors the loop inlined in `_attn_bwd_ws`, but masks
    on STAGE == 1 where the inlined copy masks on STAGE == 3 and is invoked
    with `STAGE=4 - STAGE` — presumably the same stage convention after the
    4 - STAGE mapping; confirm when wiring this helper in.

    Args:
        start_n: first K/V row handled by this tile.
        qk_fulls/qk_tiles/qk_empties: barriers + buffers for the qkT product.
        p_tiles/p_fulls: buffers + barriers for the computed pT tile.
        dp_fulls/dp_tiles: barriers + buffers for the dpT product.
        ds_tiles/ds_fulls: buffers + barriers for the computed dsT tile.
        M: per-row log-sum-exp statistics from the forward pass (base-2).
        D: per-row delta values (pre-divided by ds_scale).
        curr_m: starting query row offset.
        blk_idx: starting buffer index (drives buffer-id/phase rotation).
        step_m: row advance per iteration (BLOCK_M1).
        do_out_dtype/q_out_dtype: element dtypes for the pT and dsT stores.
        N_CTX: total sequence length.
        NUM_BUFFERS_TMEM/NUM_BUFFERS_DS: pipeline depths for tmem/ds buffers.
        BLOCK_M1/BLOCK_N1: tile sizes.
        STAGE: compile-time stage selector (see _get_unfused_bwd_loop_bounds).

    Returns:
        (curr_m, blk_idx): advanced row offset and buffer index, so a caller
        can chain a second stage's loop after this one.
    """
    offs_n = start_n + tl.arange(0, BLOCK_N1)
    lo, hi = _get_unfused_bwd_loop_bounds(N_CTX, BLOCK_M1, STAGE)
    num_steps = (hi - lo) // BLOCK_M1
    for _ in range(num_steps):
        # Rotate through the pipelined buffers; phase flips each wrap-around.
        tmem_buf_id, tmem_phase = _get_bufidx_phase(blk_idx, NUM_BUFFERS_TMEM)
        ds_buf_id, _ = _get_bufidx_phase(blk_idx, NUM_BUFFERS_DS)

        offs_m = curr_m + tl.arange(0, BLOCK_M1)
        # Per-row softmax statistics (base-2 log-sum-exp) from the forward.
        m = tl.load(M + offs_m)

        # wait for qkT = tl.dot(k, qT)
        tlx.barrier_wait(tlx.local_view(qk_fulls, tmem_buf_id), tmem_phase)
        qkT = tlx.local_load(tlx.local_view(qk_tiles, tmem_buf_id))
        tlx.barrier_arrive(tlx.local_view(qk_empties, tmem_buf_id))

        # Recompute pT = softmax numerator; exp2 matches the forward's base-2
        # statistics in M.
        pT = tl.math.exp2(qkT - m[None, :])
        if STAGE == 1:
            # Causal masking: zero out entries above the diagonal.
            mask = offs_m[None, :] >= offs_n[:, None]
            pT = tl.where(mask, pT, 0.0)

        # ppT *= qk_scale
        ppT = pT
        ppT = ppT.to(do_out_dtype)
        tlx.local_store(tlx.local_view(p_tiles, tmem_buf_id), ppT)
        tlx.barrier_arrive(tlx.local_view(p_fulls, tmem_buf_id))

        # D (= delta) is pre-divided by ds_scale.
        Di = tl.load(D + offs_m)

        # Wait for dpT = tl.dot(v, tl.trans(do))
        tlx.barrier_wait(tlx.local_view(dp_fulls, tmem_buf_id), tmem_phase)
        dpT = tlx.local_load(tlx.local_view(dp_tiles, tmem_buf_id))
        # No need to release dP, as dP uses the same tmem as dQ
        # in the same iteration. Release dQ instead later.
        dsT = pT * (dpT - Di[None, :])
        dsT = dsT.to(q_out_dtype)
        tlx.local_store(tlx.local_view(ds_tiles, ds_buf_id), dsT)
        # Make the shared-memory store visible before signaling consumers.
        tlx.fence_async_shared()
        tlx.barrier_arrive(tlx.local_view(ds_fulls, ds_buf_id))
        curr_m += step_m
        blk_idx += 1
    return curr_m, blk_idx
1193+
1194+
11101195
@triton.autotune(configs=configs_bwd_tlx, key=["N_CTX", "HEAD_DIM"])
11111196
@triton.jit
11121197
def _attn_bwd_ws(
@@ -1139,6 +1224,7 @@ def _attn_bwd_ws(
11391224
NUM_BUFFERS_DS: tl.constexpr,
11401225
NUM_BUFFERS_TMEM: tl.constexpr,
11411226
EPILOGUE_SUBTILE: tl.constexpr,
1227+
STAGE: tl.constexpr,
11421228
):
11431229
# allocate smem buffers
11441230
k_tiles = tlx.local_alloc((BLOCK_N1, HEAD_DIM), tlx.dtype_of(desc_k), NUM_BUFFERS_KV)
@@ -1200,8 +1286,15 @@ def _attn_bwd_ws(
12001286
with tlx.async_tasks():
12011287
# reduction
12021288
with tlx.async_task("default"):
1203-
off_chz, off_bh, start_m, _, num_steps = bwd_caculate_offsets(stride_z, stride_h, stride_tok, H, N_CTX,
1204-
BLOCK_M1, BLOCK_N1)
1289+
off_chz, off_bh, start_m, _, num_steps = bwd_caculate_offsets(
1290+
stride_z,
1291+
stride_h,
1292+
stride_tok,
1293+
H,
1294+
N_CTX,
1295+
BLOCK_M1,
1296+
BLOCK_N1,
1297+
)
12051298
curr_m = start_m
12061299
step_m = BLOCK_M1
12071300
for blk_idx in range(num_steps):
@@ -1227,49 +1320,67 @@ def _attn_bwd_ws(
12271320

12281321
# compute
12291322
with tlx.async_task(num_warps=8, registers=192, replicate=1):
1230-
off_chz, off_bh, start_m, start_n, num_steps = bwd_caculate_offsets(stride_z, stride_h, stride_tok, H,
1231-
N_CTX, BLOCK_M1, BLOCK_N1)
1232-
1323+
off_chz, off_bh, _, start_n, _ = bwd_caculate_offsets(
1324+
stride_z,
1325+
stride_h,
1326+
stride_tok,
1327+
H,
1328+
N_CTX,
1329+
BLOCK_M1,
1330+
BLOCK_N1,
1331+
)
12331332
# offset pointers for batch/head
12341333
M += off_chz
12351334
D += off_chz
1236-
curr_m = start_m
1335+
curr_m = 0
12371336
step_m = BLOCK_M1
1238-
for blk_idx in range(num_steps):
1239-
tmem_buf_id, tmem_phase = _get_bufidx_phase(blk_idx, NUM_BUFFERS_TMEM)
1240-
ds_buf_id, _ = _get_bufidx_phase(blk_idx, NUM_BUFFERS_DS)
1241-
1242-
offs_m = curr_m + tl.arange(0, BLOCK_M1)
1243-
m = tl.load(M + offs_m)
1244-
1245-
# wait for qkT = tl.dot(k, qT)
1246-
tlx.barrier_wait(qk_fulls[tmem_buf_id], tmem_phase)
1247-
qkT = tlx.local_load(qk_tiles[tmem_buf_id])
1248-
tlx.barrier_arrive(qk_empties[tmem_buf_id])
1249-
1250-
pT = tl.math.exp2(qkT - m[None, :])
1251-
1252-
# ppT *= qk_scale
1253-
ppT = pT
1254-
ppT = ppT.to(tlx.dtype_of(desc_do))
1255-
tlx.local_store(p_tiles[tmem_buf_id], ppT)
1256-
tlx.barrier_arrive(p_fulls[tmem_buf_id])
1257-
1258-
# D (= delta) is pre-divided by ds_scale.
1259-
Di = tl.load(D + offs_m)
1260-
1261-
# Wait for dpT = tl.dot(v, tl.trans(do))
1262-
tlx.barrier_wait(dp_fulls[tmem_buf_id], tmem_phase)
1263-
dpT = tlx.local_load(dp_tiles[tmem_buf_id])
1264-
# No need to release dP, as dP uses the same tmem as dQ
1265-
# in the same iteration. Release dQ instead later.
1266-
dsT = pT * (dpT - Di[None, :])
1267-
dsT = dsT.to(tlx.dtype_of(desc_q))
1268-
tlx.local_store(ds_tiles[ds_buf_id], dsT)
1269-
tlx.fence_async_shared()
1270-
tlx.barrier_arrive(ds_fulls[ds_buf_id])
1271-
curr_m += step_m
1272-
1337+
do_out_dtype = tlx.dtype_of(desc_do)
1338+
q_out_dtype = tlx.dtype_of(desc_q)
1339+
if STAGE & 1:
1340+
offs_n = start_n + tl.arange(0, BLOCK_N1)
1341+
lo, hi = _get_unfused_bwd_loop_bounds(N_CTX, BLOCK_M1, STAGE=4 - STAGE)
1342+
num_steps = (hi - lo) // BLOCK_M1
1343+
blk_idx = 0
1344+
for _ in range(num_steps):
1345+
tmem_buf_id, tmem_phase = _get_bufidx_phase(blk_idx, NUM_BUFFERS_TMEM)
1346+
ds_buf_id, _ = _get_bufidx_phase(blk_idx, NUM_BUFFERS_DS)
1347+
1348+
offs_m = curr_m + tl.arange(0, BLOCK_M1)
1349+
m = tl.load(M + offs_m)
1350+
1351+
# wait for qkT = tl.dot(k, qT)
1352+
tlx.barrier_wait(tlx.local_view(qk_fulls, tmem_buf_id), tmem_phase)
1353+
qkT = tlx.local_load(tlx.local_view(qk_tiles, tmem_buf_id))
1354+
tlx.barrier_arrive(tlx.local_view(qk_empties, tmem_buf_id))
1355+
1356+
pT = tl.math.exp2(qkT - m[None, :])
1357+
if STAGE == 3:
1358+
mask = offs_m[None, :] >= offs_n[:, None]
1359+
pT = tl.where(mask, pT, 0.0)
1360+
1361+
# ppT *= qk_scale
1362+
ppT = pT
1363+
ppT = ppT.to(do_out_dtype)
1364+
tlx.local_store(tlx.local_view(p_tiles, tmem_buf_id), ppT)
1365+
tlx.barrier_arrive(tlx.local_view(p_fulls, tmem_buf_id))
1366+
1367+
# D (= delta) is pre-divided by ds_scale.
1368+
Di = tl.load(D + offs_m)
1369+
1370+
# Wait for dpT = tl.dot(v, tl.trans(do))
1371+
tlx.barrier_wait(tlx.local_view(dp_fulls, tmem_buf_id), tmem_phase)
1372+
dpT = tlx.local_load(tlx.local_view(dp_tiles, tmem_buf_id))
1373+
# No need to release dP, as dP uses the same tmem as dQ
1374+
# in the same iteration. Release dQ instead later.
1375+
dsT = pT * (dpT - Di[None, :])
1376+
dsT = dsT.to(q_out_dtype)
1377+
tlx.local_store(tlx.local_view(ds_tiles, ds_buf_id), dsT)
1378+
tlx.fence_async_shared()
1379+
tlx.barrier_arrive(tlx.local_view(ds_fulls, ds_buf_id))
1380+
curr_m += step_m
1381+
blk_idx += 1
1382+
# TODO: Add the STAGE & 2 handling when we can determine bounds to divide
1383+
# the work across two loops, based on optimizing out the mask.
12731384
# epilogue
12741385
kv_buf_id, kv_phase = _get_bufidx_phase(0, NUM_BUFFERS_KV)
12751386

@@ -1303,8 +1414,15 @@ def _attn_bwd_ws(
13031414

13041415
# mma
13051416
with tlx.async_task(num_warps=1, registers=48):
1306-
_, _, start_m, _, num_steps = bwd_caculate_offsets(stride_z, stride_h, stride_tok, H, N_CTX, BLOCK_M1,
1307-
BLOCK_N1)
1417+
_, _, start_m, _, num_steps = bwd_caculate_offsets(
1418+
stride_z,
1419+
stride_h,
1420+
stride_tok,
1421+
H,
1422+
N_CTX,
1423+
BLOCK_M1,
1424+
BLOCK_N1,
1425+
)
13081426

13091427
kv_buf_id, kv_phase = _get_bufidx_phase(0, NUM_BUFFERS_KV)
13101428
tlx.barrier_wait(k_fulls[kv_buf_id], kv_phase)
@@ -1469,8 +1587,15 @@ def _attn_bwd_ws(
14691587

14701588
# load
14711589
with tlx.async_task(num_warps=1, registers=88):
1472-
_, off_bh, start_m, start_n, num_steps = bwd_caculate_offsets(stride_z, stride_h, stride_tok, H, N_CTX,
1473-
BLOCK_M1, BLOCK_N1)
1590+
_, off_bh, start_m, start_n, num_steps = bwd_caculate_offsets(
1591+
stride_z,
1592+
stride_h,
1593+
stride_tok,
1594+
H,
1595+
N_CTX,
1596+
BLOCK_M1,
1597+
BLOCK_N1,
1598+
)
14741599
# Load K
14751600
kv_buf_id, _ = _get_bufidx_phase(0, NUM_BUFFERS_KV)
14761601
tlx.barrier_expect_bytes(k_fulls[kv_buf_id], 2 * BLOCK_N1 * HEAD_DIM) # float16
@@ -1632,6 +1757,7 @@ def grid(META):
16321757
ctx.save_for_backward(q, k, v, o, M)
16331758
ctx.sm_scale = sm_scale
16341759
ctx.HEAD_DIM = HEAD_DIM_K
1760+
ctx.causal = causal
16351761
return o
16361762

16371763
@staticmethod
@@ -1655,7 +1781,7 @@ def backward(ctx, do):
16551781
_attn_bwd_preprocess[pre_grid](
16561782
o, do, #
16571783
delta, #
1658-
BATCH, N_HEAD, N_CTX, #
1784+
N_CTX, #
16591785
BLOCK_M=PRE_BLOCK, HEAD_DIM=ctx.HEAD_DIM, #
16601786
)
16611787

@@ -1716,13 +1842,16 @@ def grid(meta):
17161842
BATCH * N_HEAD,
17171843
) # batch*heads
17181844

1845+
stage = 3 if ctx.causal else 1
1846+
17191847
_attn_bwd_ws[grid](
17201848
desc_q, desc_k, desc_v, ctx.sm_scale, desc_do, desc_dq, desc_dk, desc_dv, #
17211849
M, delta, #
17221850
q.stride(0), q.stride(1), q.stride(2), q.stride(3), #
17231851
N_HEAD, N_CTX, #
17241852
BLK_SLICE_FACTOR=BLK_SLICE_FACTOR, #
17251853
HEAD_DIM=ctx.HEAD_DIM, #
1854+
STAGE=stage, #
17261855
)
17271856

17281857
return dq, dk, dv, None, None, None, None
@@ -1750,8 +1879,6 @@ def test_op(Z, H, N_CTX, HEAD_DIM, mode, provider, causal, dtype=torch.float16):
17501879
sm_scale = 0.5
17511880
# reference implementation
17521881
ref_dtype = dtype
1753-
if mode == "bwd" and causal:
1754-
pytest.skip("Causal not supported for bwd yet")
17551882
if mode == "fwd" and "fp8" in provider:
17561883
ref_dtype = torch.float32
17571884
q = q.to(ref_dtype)

0 commit comments

Comments
 (0)