
Commit 3323490

improve patch for attention
1 parent 175a800 commit 3323490

File tree: 2 files changed, +61 -0 lines changed


_unittests/ut_torch_export_patches/test_patch_transformers.py

Lines changed: 53 additions & 0 deletions
@@ -5,6 +5,8 @@
 import onnx_diagnostic.torch_export_patches.patches.patch_transformers as patch_transformers
 from onnx_diagnostic.ext_test_case import ExtTestCase, requires_transformers
 from onnx_diagnostic.helpers.torch_helper import torch_deepcopy
+from onnx_diagnostic.export.shape_helper import make_fake_with_dynamic_dimensions
+from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


 class TestPatchPatchTransformers(ExtTestCase):
@@ -121,6 +123,57 @@ def test_causal_mask_in_scaled_dot_product_attention(self):
         attn_causal_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
         self.assertEqual(attn_causal_bias.min().item(), -float("inf"))

+    # @ignore_warnings(UserWarning)
+    def test_causal_mask_in_scaled_dot_product_attention_export(self):
+        sdpa_attention_forward = sdpa_attention.sdpa_attention_forward
+        patched_sdpa_attention_forward = patch_transformers.patched_sdpa_attention_forward
+        kwargs = {
+            "module": None,
+            "query": torch.rand((1, 2, 1, 96), dtype=torch.float32),
+            "key": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "value": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "attention_mask": None,
+            "attention_dropout": 0,
+            "scaling": 0.10206207261596575,
+            "is_causal": True,
+        }
+        expected = sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        got = patched_sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        self.assertEqualArray(expected, got)
+
+        class Model(torch.nn.Module):
+            def forward(self, query, key, value):
+                kwargs = {
+                    "module": None,
+                    "query": query,
+                    "key": key,
+                    "value": value,
+                    "attention_mask": None,
+                    "attention_dropout": 0,
+                    "scaling": 0.10206207261596575,
+                    "is_causal": True,
+                }
+                return patched_sdpa_attention_forward(**kwargs)[0]
+
+        query, key, value = kwargs["query"], kwargs["key"], kwargs["value"]
+        model = Model()
+        got = model(query, key, value)
+        self.assertEqualArray(expected, got)
+
+        # static export
+        ep = torch.export.export(model, (query, key, value))
+        got = ep.module()(query, key, value)
+        self.assertEqualArray(expected, got)
+
+        # dynamic
+        ds = ({0: "batch", 2: "seq1"}, {0: "batch", 2: "seq2"}, {0: "batch", 2: "seq2"})
+        fake_inputs, _ = make_fake_with_dynamic_dimensions((query, key, value), ds)
+        print("****", fake_inputs)
+        epd = torch.export.export(model, fake_inputs)  # , dynamic_shapes=use_dyn_not_str(ds))
+        print(epd)
+        got = epd.module()(query, key, value)
+        self.assertEqualArray(expected, got)
+

 if __name__ == "__main__":
     unittest.main(verbosity=2)
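
Note (not part of the commit): the dynamic branch of the new test exports from fake tensors built by make_fake_with_dynamic_dimensions, with the explicit dynamic_shapes argument left commented out. As a rough, hedged sketch of the same idea using only public torch.export APIs, one could mark the batch and sequence dimensions with torch.export.Dim on a stand-in module; the module and shapes below are illustrative only, not the patched attention:

import torch


class ScaledDotProduct(torch.nn.Module):
    # Stand-in for the patched attention: a plain matmul over query/key so the
    # sketch runs without transformers or onnx_diagnostic installed.
    def forward(self, query, key):
        return query @ key.transpose(-1, -2)


batch = torch.export.Dim("batch")
seq1 = torch.export.Dim("seq1")
seq2 = torch.export.Dim("seq2")
ep = torch.export.export(
    ScaledDotProduct(),
    (torch.rand(2, 2, 3, 96), torch.rand(2, 2, 4, 96)),
    dynamic_shapes=({0: batch, 2: seq1}, {0: batch, 2: seq2}),
)
print(ep.graph)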

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 8 additions & 0 deletions
@@ -1374,6 +1374,14 @@ def patched_sdpa_attention_forward(
         attention_mask is None or attention_mask.shape[3] == key.shape[2],
         "Attention mask shape incompatible with key shape.",
     )
+    torch._check(
+        query.shape[0] == key.shape[0] or query.shape[0] == 1,
+        lambda: f"broadcast issue query (1): {query.shape}, key: {key.shape}, value: {value.shape}",
+    )
+    torch._check(
+        key.shape[0] == value.shape[0] or key.shape[0] == 1,
+        lambda: f"broadcast issue query (2): {query.shape}, key: {key.shape}, value: {value.shape}",
+    )
     if is_causal:
         attn_output = torch.cond(
             query.shape[2] > 1,  # distinction between prefill and decoding steps