@@ -1351,6 +1351,25 @@ def patched_sdpa_attention_forward(
         "`sdpa` attention does not support `output_attentions=True`."
         " Please set your attention to `eager` if you want any of these features."
     )
+    torch._check(
+        attention_mask is None or attention_mask.shape[3] == key.shape[2],
+        "Attention mask shape incompatible with key shape.",
+    )
+    torch._check(
+        query.shape[0] == key.shape[0] or query.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (1): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+    torch._check(
+        key.shape[0] == value.shape[0] or key.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (2): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+
     sdpa_kwargs = {}
     if hasattr(module, "num_key_value_groups"):
         if not transformers.integrations.sdpa_attention.use_gqa_in_sdpa(attention_mask, key):
@@ -1367,49 +1386,50 @@ def patched_sdpa_attention_forward(
         attention_mask = attention_mask[:, :, :, : key.shape[-2]]
 
     if patch_is_causal:
+        # transformers>=4.55
         is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
 
         # PATCHED: remove the test query.shape[2] > 1
         # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
         # and we split the test to keep the minimum in torch.cond
         is_causal = attention_mask is None and is_causal
-    elif is_causal is None:
-        is_causal = attention_mask is None
 
-    torch._check(
-        attention_mask is None or attention_mask.shape[3] == key.shape[2],
-        "Attention mask shape incompatible with key shape.",
-    )
-    torch._check(
-        query.shape[0] == key.shape[0] or query.shape[0] == 1,
-        lambda: (
-            f"broadcast issue query (1): {query.shape}, key: {key.shape}, "
-            f"value: {value.shape}"
-        ),
-    )
-    torch._check(
-        key.shape[0] == value.shape[0] or key.shape[0] == 1,
-        lambda: (
-            f"broadcast issue query (2): {query.shape}, key: {key.shape}, "
-            f"value: {value.shape}"
-        ),
-    )
-    if not is_causal or not patch_is_causal:
-        return (
-            torch.nn.functional.scaled_dot_product_attention(
-                query,
-                key,
-                value,
-                attn_mask=attention_mask,
-                dropout_p=dropout,
-                scale=scaling,
-                is_causal=is_causal,
-                **sdpa_kwargs,
+        if not is_causal:
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
+            )
+    else:
+        # transformers<4.55
+        if is_causal is None and attention_mask is not None:
+            is_causal = False
+        if is_causal is not None:
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
             )
-            .transpose(1, 2)
-            .contiguous(),
-            None,
-        )
 
     # To avoid the following errors:
     # is_causal=query.shape[2] > 1