@@ -40,13 +40,17 @@ def is_blackwell():
     return is_cuda() and torch.cuda.get_device_capability()[0] == 10


+def is_hopper():
+    return is_cuda() and torch.cuda.get_device_capability()[0] == 9
+
+
 @triton.jit
 def _attn_fwd_inner(acc, l_i, m_i, q,  #
                     desc_k, desc_v,  #
                     offset_y, dtype: tl.constexpr, start_m, qk_scale,  #
                     BLOCK_M: tl.constexpr, HEAD_DIM: tl.constexpr, BLOCK_N: tl.constexpr,  #
                     STAGE: tl.constexpr, offs_m: tl.constexpr, offs_n: tl.constexpr,  #
-                    N_CTX: tl.constexpr, warp_specialize: tl.constexpr):
+                    N_CTX: tl.constexpr, warp_specialize: tl.constexpr, SUBTILE_EPILOGUE: tl.constexpr):
     # range of values handled by this stage
     if STAGE == 1:
         lo, hi = 0, start_m * BLOCK_M
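For readers unfamiliar with the capability numbers behind `is_hopper()` and `is_blackwell()`: Hopper-class GPUs report CUDA compute capability major version 9 and Blackwell reports 10. A minimal standalone sketch of the same check outside the tutorial (the helper name `cc_major` is hypothetical, not part of this patch):

```python
import torch

def cc_major() -> int:
    """CUDA compute capability major version, or -1 if no GPU is present."""
    return torch.cuda.get_device_capability()[0] if torch.cuda.is_available() else -1

# Hopper (sm_90*) reports 9, Blackwell reports 10 -- the values the new
# is_hopper() helper and the existing is_blackwell() helper test against.
print("Hopper-like device:", cc_major() == 9)
print("Blackwell-like device:", cc_major() == 10)
```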
@@ -80,7 +84,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q,  #
         alpha = tl.math.exp2(m_i - m_ij)
         l_ij = tl.sum(p, 1)
         # -- update output accumulator --
-        if warp_specialize and BLOCK_M == 128 and HEAD_DIM == 128:
+        if SUBTILE_EPILOGUE:
             BM: tl.constexpr = acc.shape[0]
             BN: tl.constexpr = acc.shape[1]
             acc0, acc1 = acc.reshape([BM, 2, BN // 2]).permute(0, 2, 1).split()
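The reshape/permute/split idiom guarded by `SUBTILE_EPILOGUE` splits the accumulator into its left and right column halves so each half can be rescaled independently. A plain-PyTorch sketch of the same index arithmetic (not Triton; Triton's `split()` on a trailing dimension of size 2 is mimicked here with `unbind`):

```python
import torch

# Sketch only: shows that reshape([BM, 2, BN // 2]).permute(0, 2, 1).split()
# is equivalent to slicing the accumulator into its left and right halves.
BM, BN = 4, 8
acc = torch.arange(BM * BN, dtype=torch.float32).reshape(BM, BN)

acc0, acc1 = acc.reshape(BM, 2, BN // 2).permute(0, 2, 1).unbind(dim=-1)

assert torch.equal(acc0, acc[:, :BN // 2])  # left half of the columns
assert torch.equal(acc1, acc[:, BN // 2:])  # right half of the columns
```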
@@ -175,6 +179,7 @@ def _attn_fwd(sm_scale, M,  #
               FP8_OUTPUT: tl.constexpr,  #
               STAGE: tl.constexpr,  #
               warp_specialize: tl.constexpr,  #
+              SUBTILE_EPILOGUE: tl.constexpr,  #
               ):
     dtype = tl.float8e5 if FP8_OUTPUT else tl.float16
     tl.static_assert(BLOCK_N <= HEAD_DIM)
@@ -220,15 +225,15 @@ def _attn_fwd(sm_scale, M,  #
                                         offset_y, dtype, start_m, qk_scale,  #
                                         BLOCK_M, HEAD_DIM, BLOCK_N,  #
                                         4 - STAGE, offs_m, offs_n, N_CTX,  #
-                                        warp_specialize)
+                                        warp_specialize, SUBTILE_EPILOGUE)
     # stage 2: on-band
     if STAGE & 2:
         acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q,  #
                                         desc_k, desc_v,  #
                                         offset_y, dtype, start_m, qk_scale,  #
                                         BLOCK_M, HEAD_DIM, BLOCK_N,  #
                                         2, offs_m, offs_n, N_CTX,  #
-                                        warp_specialize)
+                                        warp_specialize, SUBTILE_EPILOGUE)
     # epilogue
     m_i += tl.math.log2(l_i)
     acc = acc / l_i[:, None]
@@ -504,7 +509,8 @@ def forward(ctx, q, k, v, causal, sm_scale, warp_specialize=True):
             extra_kern_args = {"waves_per_eu": waves_per_eu, "allow_flush_denorm": True}

         M = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
-        if supports_host_descriptor():
+        # Use device_descriptor for Hopper + warpspec.
+        if supports_host_descriptor() and not (is_hopper() and warp_specialize):
             # Note that on Hopper we cannot perform a FP8 dot with a non-transposed second tensor
             y_dim = q.shape[0] * q.shape[1] * q.shape[2]

@@ -533,7 +539,8 @@ def grid(META):
             return (triton.cdiv(q.shape[2], META["BLOCK_M"]), q.shape[0] * q.shape[1], 1)

         ctx.grid = grid
-        if is_cuda() and warp_specialize:
+        SUBTILE_EPILOGUE = False if is_hopper() and warp_specialize else True
+        if is_blackwell() and warp_specialize:
             if HEAD_DIM_K == 128 and q.dtype == torch.float16:
                 extra_kern_args["maxnreg"] = 168
             else:
@@ -547,7 +554,7 @@ def grid(META):
             FP8_OUTPUT=q.dtype == torch.float8_e5m2,  #
             STAGE=stage,  #
             warp_specialize=warp_specialize,  #
-            **extra_kern_args)
+            SUBTILE_EPILOGUE=SUBTILE_EPILOGUE, **extra_kern_args)

         ctx.save_for_backward(q, k, v, o, M)
         ctx.sm_scale = sm_scale
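Taken together, the forward() changes let warp specialization be requested on Hopper as well: the host then skips host-side descriptors and disables epilogue subtiling. A hedged usage sketch, assuming the tutorial's `attention` autograd wrapper (`attention = _attention.apply`) and an fp16-capable CUDA device; the shapes and scale value are illustrative only:

```python
import torch

# Illustrative shapes: (batch, heads, sequence, head_dim).
q = torch.randn(1, 4, 1024, 128, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

sm_scale = 1.0 / (q.shape[-1] ** 0.5)

# forward(ctx, q, k, v, causal, sm_scale, warp_specialize=True): with this patch,
# warp_specialize=True is honored on Hopper (device descriptors, no epilogue
# subtiling) as well as on Blackwell.
o = attention(q, k, v, False, sm_scale, True)
```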
@@ -684,7 +691,9 @@ def test_op(Z, H, N_CTX, HEAD_DIM, causal, warp_specialize, mode, provider, dtyp
 for HEAD_DIM in [64, 128]:
     for mode in ["fwd", "bwd"]:
         for causal in [True, False]:
-            for warp_specialize in [False, True] if is_blackwell() else [False]:
+            # Enable warpspec for non-causal fwd on Hopper
+            for warp_specialize in [False, True] if (is_blackwell() or
+                                                     (is_hopper() and mode == "fwd" and not causal)) else [False]:
                 configs.append(
                     triton.testing.Benchmark(
                         x_names=["N_CTX"],