[Blackwell_Attention] [Triton] Make N_CTX const in DP FA kernel

njriasan · web-flow · commit b03df33617cb · 2025-10-09T13:00:45.000-04:00
diff --git a/tritonbench/kernels/attention_utils.py b/tritonbench/kernels/attention_utils.py
@@ -16,7 +16,7 @@
 PEEL_LAST = os.getenv("PEEL_LAST_ITER")
 WITH_TMA = os.getenv("WITH_TMA")
 HAS_EXPLICIT_WS = os.getenv("ENABLE_EXPLICIT_WS")
-SUPPORT_GLUON = os.getenv("WITH_GLUON")
+SUPPORT_GLUON = os.getenv("WITH_GLUON") == "1"
 WITH_MAXNREG = os.getenv("WITH_MAXNREG")
 
 
diff --git a/tritonbench/kernels/blackwell_triton_fused_attention_dp.py b/tritonbench/kernels/blackwell_triton_fused_attention_dp.py
@@ -291,7 +291,7 @@ def make_standard_config(BM, BN, s, w, subtile, vectmul, add2reduce):
     configs = [
         make_standard_config(BM, BN, s, w, subtile, vectmul, add2reduce)
         for BM in [256]
-        for BN in [128]
+        for BN in [64, 128]
         for s in NUM_STAGES_OPTIONS
         for w in [4]
         for subtile in [True]
@@ -318,6 +318,13 @@ def prune_invalid_configs(configs, named_args, **kwargs):
     return [conf for conf in configs if conf.kwargs.get("BLOCK_M", 0) <= N_CTX]
 
 
+def prune_persistent_configs(configs, named_args, **kwargs):
+    N_CTX = kwargs["N_CTX"]
+    # Keep only the configs whose BLOCK_N equals the target chosen for this N_CTX
+    TARGET_BLOCK_N = 64 if N_CTX == 128 else 128
+    return [conf for conf in configs if conf.kwargs.get("BLOCK_N", 0) == TARGET_BLOCK_N]
+
+
 @triton.jit
 def _maybe_make_tensor_desc(desc_or_ptr, shape, strides, block_shape):
     if isinstance(desc_or_ptr, tl.tensor_descriptor):
@@ -399,7 +406,7 @@ def _attn_fwd_tma_dp(
     desc_o,
     pid,
     off_hz,
-    N_CTX,  #
+    N_CTX: tl.constexpr,  #
     HEAD_DIM: tl.constexpr,  #
     BLOCK_M: tl.constexpr,  #
     BLOCK_N: tl.constexpr,  #
@@ -543,7 +550,7 @@ def _attn_fwd(
     desc_k,
     desc_v,
     desc_o,
-    N_CTX,  #
+    N_CTX: tl.constexpr,  #
     HEAD_DIM: tl.constexpr,  #
     BLOCK_M: tl.constexpr,  #
     BLOCK_N: tl.constexpr,  #
@@ -585,7 +592,7 @@ def _attn_fwd(
 @triton.autotune(
     configs=list(filter(keep, configs)),
     key=["N_CTX", "HEAD_DIM", "FP8_OUTPUT", "warp_specialize"],
-    prune_configs_by={"early_config_prune": prune_invalid_configs},
+    prune_configs_by={"early_config_prune": prune_persistent_configs},
 )
 @triton.jit
 def _attn_fwd_persist(
@@ -597,7 +604,7 @@ def _attn_fwd_persist(
     desc_k,
     desc_v,
     desc_o,
-    N_CTX,  #: tl.constexpr,  #
+    N_CTX: tl.constexpr,  #
     HEAD_DIM: tl.constexpr,  #
     BLOCK_M: tl.constexpr,  #
     BLOCK_N: tl.constexpr,  #
diff --git a/tritonbench/operators/blackwell_attentions/operator.py b/tritonbench/operators/blackwell_attentions/operator.py
@@ -28,7 +28,7 @@
     )
 
     HAS_BLACKWELL_AUTOWS = True
-except (ImportError, IOError, AttributeError):
+except (ImportError, IOError, AttributeError, TypeError):
     # Needs compiler that supports autoWS
     HAS_BLACKWELL_AUTOWS = False
 
@@ -492,7 +492,7 @@ def gluon_blackwell_tutorial_fwd(
         return lambda: gluon_blackwell_fwd(q, k, v, self.causal, self.sm_scale)
 
     # Only works with triton main, forward only.
-    @register_benchmark(enabled=False)
+    @register_benchmark(enabled=SUPPORT_GLUON)
     def gluon_blackwell_tutorial_persistent_fwd(
         self,
         q: torch.Tensor,