Remove the if-else block in causal loop (#779)

zhanglx13 · web-flow · commit 34ee12046bde · 2025-04-23T09:53:48.000-05:00
* Remove the if-else block in causal loop

* Remove unnecessary ( )

* Fix formatting
diff --git a/python/perf-kernels/flash-attention.py b/python/perf-kernels/flash-attention.py
@@ -261,21 +261,29 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
         # We start from end of seqlen_k so only the first iteration would need
         # to be checked for padding if it is not a multiple of block_n
         # TODO: This can be optimized to only be true for the padded block.
+        mask = tl.full([BLOCK_M, BLOCK_N], True, dtype=tl.int1)
         if MASK_STEPS:
             # If this is the last block / iteration, we want to
             # mask if the sequence length is not a multiple of block size
             # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not is_modulo_mn.
             # last step might get wasted but that is okay. check if this masking works For
             # that case.
-            if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
-                boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32)
-                size_n = start_n + OFFS_N[None, :]
-                mask = size_n < boundary_m[:, None]
-                qk = tl.where(mask, qk, float("-inf"))
+
+            # The former branch
+            #     if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
+            # is replaced by a select: mask_partial is now computed unconditionally
+            # at runtime, but the causal loop no longer contains an if-else block,
+            # which helps instruction scheduling and register pressure.
+            bound_cond = (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0)
+            boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32)
+            size_n = start_n + OFFS_N[None, :]
+            mask_partial = size_n < boundary_m[:, None]
+            mask = tl.where(bound_cond, mask_partial, mask)
         if IS_CAUSAL:
             causal_boundary = start_n + offs_n_causal
             causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
-            qk = tl.where(causal_mask, qk, float("-inf"))
+            mask = mask and causal_mask
+        qk = tl.where(mask, qk, float("-inf"))
         # -- compute qk ----
         if INT8_GEMM:
             qk += ((((tl.dot(q, k).to(tl.float32) * q_descale)) * k_descale) * QK_SCALE)
@@ -370,7 +378,7 @@ def is_hip():
 
 def is_cdna():
     return is_hip() and triton.runtime.driver.active.get_current_target().arch in ('gfx940', 'gfx941', 'gfx942',
-                                                                                   'gfx90a', 'gfx908')
+                                                                                   'gfx950', 'gfx90a', 'gfx908')
 
 
 def is_rdna():