1 parent 19a657b · commit 04fb006
nemo_automodel/components/models/llama_bidirectional/model.py
@@ -106,6 +106,10 @@ def _update_causal_mask(
     ):
         if attention_mask is None:
            return None
+        if getattr(self.config, "_attn_implementation", None) == "flash_attention_2":
+            # Flash Attention handles padding from the raw 2D mask;
+            # bidirectional attention is ensured by is_causal=False on all layers.
+            return attention_mask
        dtype = input_tensor.dtype if input_tensor is not None else torch.float32
        return _prepare_4d_attention_mask(attention_mask, dtype)
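
For context, here is a minimal sketch of why the two paths differ. The non-flash path expands the 2D padding mask into the kind of 4D additive mask that _prepare_4d_attention_mask produces, while flash_attention_2 consumes the raw 2D mask directly, so the new branch returns it unchanged. The helper name expand_2d_to_4d_bidirectional below is hypothetical and only illustrates the expansion; it is not the library implementation.

import torch

def expand_2d_to_4d_bidirectional(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # attention_mask: [batch, seq_len] with 1 = real token, 0 = padding.
    # Returns an additive mask of shape [batch, 1, seq_len, seq_len]:
    # 0.0 where attention is allowed, dtype-min where the key position is padding.
    # No causal triangle is applied, so attention stays bidirectional.
    bsz, src_len = attention_mask.shape
    expanded = attention_mask[:, None, None, :].expand(bsz, 1, src_len, src_len).to(dtype)
    return (1.0 - expanded) * torch.finfo(dtype).min

mask = torch.tensor([[1, 1, 1, 0]])  # one padded position at the end
print(expand_2d_to_4d_bidirectional(mask, torch.float32).shape)  # torch.Size([1, 1, 4, 4])

With flash_attention_2 this expansion is unnecessary: the kernel handles padding from the [batch, seq_len] mask itself, and bidirectionality depends only on is_causal=False being set on every layer, which is exactly what the added early return relies on.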