Commit a545ebf

fix some more filter issues, address feedback
Signed-off-by: Sudhakar Singh <[email protected]>
1 parent 93548fc commit a545ebf

File tree

4 files changed, +16 -15 lines


tests/pytorch/utils.py

Lines changed: 7 additions & 7 deletions
@@ -353,11 +353,11 @@ def test():
     backends = {0: "F16_max512_seqlen", 1: "F16_arbitrary_seqlen", 2: "FP8"}
     if AttentionLogging._is_logging_setup is False:
         AttentionLogging.setup_logging()
-    with logging_context(highest_level=AttentionLogging._log_level):
-        for i in range(3):
-            os.environ["NVTE_FUSED_ATTN_BACKEND"] = str(i)
-            _attention_backends["backend_selection_requires_update"] = True
-            available_backends, flash_attention_backend, fused_attention_backend = test()
-            if fused_attention_backend == FusedAttnBackend[backends[i]]:
-                fused_attn_backends.append(fused_attention_backend)
+
+    for i in range(3):
+        os.environ["NVTE_FUSED_ATTN_BACKEND"] = str(i)
+        _attention_backends["backend_selection_requires_update"] = True
+        available_backends, flash_attention_backend, fused_attention_backend = test()
+        if fused_attention_backend == FusedAttnBackend[backends[i]]:
+            fused_attn_backends.append(fused_attention_backend)
     return available_backends, flash_attention_backend, fused_attn_backends
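
The only change here is dropping the `logging_context` wrapper; the probing loop itself is untouched. For reference, a minimal standalone sketch of the same pattern, with `run_selection` as a hypothetical stand-in for the module's `test()` closure:

import os

# Hypothetical stand-in for the test() closure in tests/pytorch/utils.py,
# which re-runs backend selection and reports what is available.
def run_selection():
    return os.environ.get("NVTE_FUSED_ATTN_BACKEND")

backends = {0: "F16_max512_seqlen", 1: "F16_arbitrary_seqlen", 2: "FP8"}
probed = []
for i in range(3):
    # Request backend i; the real test also sets
    # _attention_backends["backend_selection_requires_update"] = True
    # so the cached selection is recomputed on the next call.
    os.environ["NVTE_FUSED_ATTN_BACKEND"] = str(i)
    probed.append((backends[i], run_selection()))
print(probed)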

transformer_engine/pytorch/attention/dot_product_attention/backends.py

Lines changed: 4 additions & 2 deletions
@@ -347,6 +347,8 @@ def forward(
                 attention_mask=attention_mask,
                 window_size=window_size,
                 attention_type=self.attention_type,
+                bottom_right_alignment=(attn_mask_type not in ["causal", "padding_causal"]
+                if bottom_right_diagonal is None else bottom_right_diagonal)
             )
         )

@@ -450,8 +452,8 @@ def forward(
             actual_seqlens_q=actual_seqlens_q if "padding" in attn_mask_type else None,
             actual_seqlens_kv=actual_seqlens_kv if "padding" in attn_mask_type else None,
             alibi_slopes=alibi_slopes,
-            # (This should be replaced with `bottom_right_diagonal` which is passed from the arguments)
-            bottom_right_alignment=attn_mask_type not in ["causal", "padding_causal"],
+            bottom_right_alignment=(attn_mask_type not in ["causal", "padding_causal"]
+            if bottom_right_diagonal is None else bottom_right_diagonal)
         )
         matmul_result = torch.baddbmm(
             matmul_result,
transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py

Lines changed: 3 additions & 4 deletions
@@ -1280,7 +1280,6 @@ def forward(
             if self.layer_number == 1:
                 _alibi_cache["_alibi_slopes_require_update"] = True
                 _alibi_cache["_alibi_bias_require_update"] = True
-        bottom_right_alignment = (attn_mask_type not in ["causal", "padding_causal"],)
         if core_attention_bias_type == "alibi":
             assert (
                 core_attention_bias is None

@@ -1289,7 +1288,7 @@ def forward(
                 _alibi_cache["_num_heads"] != query_layer.shape[-2]
                 or _alibi_cache["_max_seqlen_q"] != max_seqlen_q
                 or _alibi_cache["_max_seqlen_kv"] != max_seqlen_kv
-                or _alibi_cache["_bottom_right_alignment"] != bottom_right_alignment
+                or _alibi_cache["_bottom_right_alignment"] != bottom_right_diagonal
                 or _alibi_cache["_alibi_slopes"] is None
             ):
                 _alibi_cache["_alibi_slopes_require_update"] = True

@@ -1471,7 +1470,7 @@ def forward(
             fu_core_attention_bias_type = core_attention_bias_type
             fu_core_attention_bias = core_attention_bias
             if core_attention_bias_type == "alibi" and (
-                alibi_slopes is not None or max_seqlen_q != max_seqlen_kv
+                alibi_slopes is not None
             ):
                 fu_core_attention_bias_type = "post_scale_bias"
                 _, fu_core_attention_bias = dpa_utils.get_alibi(

@@ -1481,7 +1480,7 @@ def forward(
                     max_seqlen_kv,
                     alibi_slopes=alibi_slopes,
                     bias_dtype=query_layer.dtype,
-                    bottom_right_alignment=attn_mask_type not in ["causal", "padding_causal"],
+                    bottom_right_alignment=bottom_right_diagonal,
                 )
             if checkpoint_core_attention:
                 return self._checkpointed_attention_forward(
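
The deleted assignment had a trailing comma inside the parentheses, so `bottom_right_alignment` was a one-element tuple rather than a bool; the cache check now compares the cached value against the `bottom_right_diagonal` argument directly. A minimal demonstration of that trailing-comma pitfall (standalone, not the module's code):

attn_mask_type = "padding"

buggy = (attn_mask_type not in ["causal", "padding_causal"],)  # 1-tuple: (True,)
fixed = attn_mask_type not in ["causal", "padding_causal"]     # bool: True

print(type(buggy).__name__, buggy)  # tuple (True,)
print(type(fixed).__name__, fixed)  # bool True
assert buggy != fixed  # a tuple never compares equal to a bool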

transformer_engine/pytorch/attention/dot_product_attention/utils.py

Lines changed: 2 additions & 2 deletions
@@ -200,7 +200,7 @@ class AttentionParams:
         `causal_bottom_right`, `padding_causal_bottom_right`, `arbitrary`}
     window_size : Tuple[int, int], default = None
         Sliding window attention size.
-    bottom_right_diagonal: bool, default = `True`
+    bottom_right_diagonal: bool, default = `None`
         Whether to align sliding window and ALiBi diagonal to the bottom right corner
         of the softmax matrix.
     alibi_slopes_shape : Optional[Union[torch.Size, List]], default = None

@@ -962,7 +962,7 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt
     if (
         use_fused_attention
         and core_attention_bias_type == "alibi"
-        and (alibi_slopes_shape is not None or max_seqlen_q != max_seqlen_kv)
+        and (alibi_slopes_shape is not None)
     ):
         fu_core_attention_bias_type = "post_scale_bias"
         fu_core_attention_bias_requires_grad = False
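
The documented default changes from `True` to `None`, i.e. "not specified by the caller"; downstream code then falls back to the mask-type-based default shown above. A simplified sketch of how that reads at the parameter level (a hypothetical, cut-down stand-in for `AttentionParams`, not the real dataclass):

from dataclasses import dataclass
from typing import Optional

@dataclass
class AttentionParamsSketch:
    attn_mask_type: str = "no_mask"
    # None means "unspecified": consumers derive the alignment from the
    # mask type instead of assuming bottom-right.
    bottom_right_diagonal: Optional[bool] = None

p = AttentionParamsSketch(attn_mask_type="causal")
effective = (p.attn_mask_type not in ["causal", "padding_causal"]
             if p.bottom_right_diagonal is None
             else p.bottom_right_diagonal)
print(effective)  # False: top-left alignment for plain causal when unspecified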
