@@ -50,7 +50,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q,  #
                     offset_y, dtype: tl.constexpr, start_m, qk_scale,  #
                     BLOCK_M: tl.constexpr, HEAD_DIM: tl.constexpr, BLOCK_N: tl.constexpr,  #
                     STAGE: tl.constexpr, offs_m: tl.constexpr, offs_n: tl.constexpr,  #
-                    N_CTX: tl.constexpr, warp_specialize: tl.constexpr, SUBTILE_EPILOGUE: tl.constexpr):
+                    N_CTX: tl.constexpr, warp_specialize: tl.constexpr, IS_HOPPER: tl.constexpr):
     # range of values handled by this stage
     if STAGE == 1:
         lo, hi = 0, start_m * BLOCK_M
@@ -84,7 +84,7 @@ def _attn_fwd_inner(acc, l_i, m_i, q,  #
         alpha = tl.math.exp2(m_i - m_ij)
         l_ij = tl.sum(p, 1)
         # -- update output accumulator --
-        if SUBTILE_EPILOGUE:
+        if not IS_HOPPER and warp_specialize and BLOCK_M == 128 and HEAD_DIM == 128:
             BM: tl.constexpr = acc.shape[0]
             BN: tl.constexpr = acc.shape[1]
             acc0, acc1 = acc.reshape([BM, 2, BN // 2]).permute(0, 2, 1).split()
@@ -179,7 +179,7 @@ def _attn_fwd(sm_scale, M,  #
               FP8_OUTPUT: tl.constexpr,  #
               STAGE: tl.constexpr,  #
               warp_specialize: tl.constexpr,  #
-              SUBTILE_EPILOGUE: tl.constexpr,  #
+              IS_HOPPER: tl.constexpr,  #
               ):
     dtype = tl.float8e5 if FP8_OUTPUT else tl.float16
     tl.static_assert(BLOCK_N <= HEAD_DIM)
@@ -225,15 +225,15 @@ def _attn_fwd(sm_scale, M,  #
                                         offset_y, dtype, start_m, qk_scale,  #
                                         BLOCK_M, HEAD_DIM, BLOCK_N,  #
                                         4 - STAGE, offs_m, offs_n, N_CTX,  #
-                                        warp_specialize, SUBTILE_EPILOGUE)
+                                        warp_specialize, IS_HOPPER)
     # stage 2: on-band
     if STAGE & 2:
         acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q,  #
                                         desc_k, desc_v,  #
                                         offset_y, dtype, start_m, qk_scale,  #
                                         BLOCK_M, HEAD_DIM, BLOCK_N,  #
                                         2, offs_m, offs_n, N_CTX,  #
-                                        warp_specialize, SUBTILE_EPILOGUE)
+                                        warp_specialize, IS_HOPPER)
     # epilogue
     m_i += tl.math.log2(l_i)
     acc = acc / l_i[:, None]
@@ -539,7 +539,6 @@ def grid(META):
             return (triton.cdiv(q.shape[2], META["BLOCK_M"]), q.shape[0] * q.shape[1], 1)
 
         ctx.grid = grid
-        SUBTILE_EPILOGUE = False if is_hopper() and warp_specialize else True
         if is_blackwell() and warp_specialize:
             if HEAD_DIM_K == 128 and q.dtype == torch.float16:
                 extra_kern_args["maxnreg"] = 168
@@ -554,7 +553,8 @@ def grid(META):
             FP8_OUTPUT=q.dtype == torch.float8_e5m2,  #
             STAGE=stage,  #
             warp_specialize=warp_specialize,  #
-            SUBTILE_EPILOGUE=SUBTILE_EPILOGUE, **extra_kern_args)
+            IS_HOPPER=is_hopper(),  #
+            **extra_kern_args)
 
         ctx.save_for_backward(q, k, v, o, M)
         ctx.sm_scale = sm_scale
@@ -692,8 +692,8 @@ def test_op(Z, H, N_CTX, HEAD_DIM, causal, warp_specialize, mode, provider, dtyp
 for mode in ["fwd", "bwd"]:
     for causal in [True, False]:
         # Enable warpspec for causal fwd on Hopper
-        for warp_specialize in [False, True] if (is_blackwell() or
-                                                 (is_hopper() and mode == "fwd" and not causal)) else [False]:
+        enable_ws = mode == "fwd" and (is_blackwell() or (is_hopper() and not causal))
+        for warp_specialize in [False, True] if enable_ws else [False]:
             configs.append(
                 triton.testing.Benchmark(
                     x_names=["N_CTX"],
x_names = ["N_CTX" ],
0 commit comments