
Commit 2523b0d

fix issues
1 parent 2b92218 commit 2523b0d

File tree

6 files changed: +246, -27 lines changed

_doc/technical/plot_generate.py

Lines changed: 1 addition & 1 deletion
@@ -155,7 +155,7 @@ def simple_generate_with_cache(
 dtype = get_weight_type(model)
 print("-- model dtype:", dtype)
 export_inputs["past_key_values"] = to_any(export_inputs["past_key_values"], dtype)
-exporter = "custom" if "custom" in sys.argv else "onnx-dynamo"
+exporter = "onnx-dynamo" if "dynamo" in sys.argv else "custom"
 model_name = f"model_{model_id.replace('/', '-')}.{exporter}.onnx"
 if not os.path.exists(model_name):
     # This step is slow so let's skip it if it was already done.

_unittests/ut_export/test_api.py

Lines changed: 19 additions & 4 deletions
@@ -1,9 +1,10 @@
 import unittest
 import torch
-from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout
+from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout, has_transformers
 from onnx_diagnostic.helpers import max_diff
 from onnx_diagnostic.helpers.torch_helper import torch_deepcopy
 from onnx_diagnostic.helpers.rt_helper import make_feeds
+from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
 from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
 from onnx_diagnostic.torch_export_patches import torch_export_patches
 from onnx_diagnostic.export.api import to_onnx
@@ -46,6 +47,10 @@ def test_tiny_llm_to_onnx(self):
             "onnx-dynamo": self.get_dump_file("test_tiny_llm_to_onnx-dynamo.onnx"),
             "modelbuilder": self.get_dump_file("model.onnx"),
         }
+        if not has_transformers("4.55"):
+            # <4.55: torch._check(causal_mask.shape[3] != 33)
+            # torch._check(causal_mask.shape[3] == 33)
+            del filenames["onnx-dynamo"]
         del inputs["position_ids"]
         del ds["position_ids"]
         del b1["position_ids"]
@@ -72,14 +77,24 @@ def test_tiny_llm_to_onnx(self):
                 diff = max_diff(expected, got)
                 assert diff["abs"] <= 1e-5, f"diff={diff}"
 
-        b1["attention_mask"][:, :] = 1
-        expected = model(**torch_deepcopy(b1))
+        problem = dict(
+            input_ids=torch.tensor([[24320]], dtype=torch.int64),
+            attention_mask=torch.tensor([[1, 1, 1, 1]], dtype=torch.int64),
+            past_key_values=make_dynamic_cache(
+                [
+                    torch.rand((1, 1, 3, 96), dtype=torch.float32),
+                    torch.rand((1, 1, 3, 96), dtype=torch.float32),
+                ]
+            ),
+        )
+
+        expected = model(**torch_deepcopy(problem))
         for exporter, filename in filenames.items():
             with self.subTest(exporter=f"full-mask-{exporter}"):
                 sess = onnxruntime.InferenceSession(
                     filename, providers=["CPUExecutionProvider"]
                 )
-                feeds = make_feeds(sess, b1, use_numpy=True)
+                feeds = make_feeds(sess, problem, use_numpy=True)
                 got = sess.run(None, feeds)
                 diff = max_diff(expected, got)
                 assert diff["abs"] <= 1e-5, f"diff={diff}"

_unittests/ut_helpers/test_rt_helper.py

Lines changed: 5 additions & 3 deletions
@@ -48,7 +48,8 @@ def simple_generate_with_cache(
             f"\ninput_ids.shape={input_ids.shape}"
             f"\nexpected={self.string_type(outputs, with_shape=True, with_min_max=True)}"
             f"\n got=\n"
-            f"{self.string_type(onnx_results, with_shape=True, with_min_max=True)}"
+            f"{self.string_type(onnx_results, with_shape=True, with_min_max=True)}\n"
+            f"feeds={self.string_type(feeds, with_shape=True, with_min_max=True)}"
         )
 
         # Next calls: decode
@@ -87,7 +88,8 @@ def simple_generate_with_cache(
             f"\ndiff={diff}\ninput_ids.shape={input_ids.shape}"
             f"\nexpected={self.string_type(outputs, with_shape=True, with_min_max=True)}"
             f"\n got=\n"
-            f"{self.string_type(onnx_results, with_shape=True, with_min_max=True)}"
+            f"{self.string_type(onnx_results, with_shape=True, with_min_max=True)}\n"
+            f"feeds={self.string_type(feeds, with_shape=True, with_min_max=True)}"
         )
         return input_ids
 
@@ -113,7 +115,7 @@ def test_onnx_generate(self):
             kwargs=inputs,
             dynamic_shapes=ds,
             filename=model_name,
-            exporter="modelbuilder",
+            exporter="custom",
         )
 
         print("-- test_onnx_generate: generate")
Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+import unittest
+import torch
+import transformers
+import transformers.integrations.sdpa_attention as sdpa_attention
+import onnx_diagnostic.torch_export_patches.patches.patch_transformers as patch_transformers
+from onnx_diagnostic.ext_test_case import ExtTestCase, requires_transformers
+from onnx_diagnostic.helpers.torch_helper import torch_deepcopy
+
+
+class TestPatchPatchTransformers(ExtTestCase):
+    @requires_transformers("4.55")
+    def test_sdpa_mask_recent_torch(self):
+        sdpa_mask_recent_torch = transformers.masking_utils.sdpa_mask_recent_torch
+        patched_sdpa_mask_recent_torch = patch_transformers.patched_sdpa_mask_recent_torch
+        kwargs = {
+            "batch_size": 1,
+            "cache_position": torch.tensor([3], dtype=torch.int64),
+            "kv_length": 4,
+            "kv_offset": 0,
+            "mask_function": transformers.masking_utils.causal_mask_function,
+            "attention_mask": torch.tensor([[True, True, True, True]]),
+            "local_size": None,
+            "allow_is_causal_skip": True,
+            "allow_is_bidirectional_skip": False,
+        }
+        expected = sdpa_mask_recent_torch(**kwargs)
+        got = patched_sdpa_mask_recent_torch(**kwargs)
+        self.assertEqual(expected, got)
+
+        kwargs = {
+            "batch_size": 1,
+            "cache_position": torch.tensor([3], dtype=torch.int64),
+            "kv_length": 4,
+            "kv_offset": 0,
+            "mask_function": transformers.masking_utils.causal_mask_function,
+            "attention_mask": torch.tensor([[True, True, True, True]]),
+            "local_size": None,
+            "allow_is_causal_skip": False,
+            "allow_is_bidirectional_skip": False,
+        }
+        expected = sdpa_mask_recent_torch(**kwargs)
+        got = patched_sdpa_mask_recent_torch(**kwargs)
+        self.assertEqualArray(expected, got)
+
+    @requires_transformers("4.55")
+    def test_sdpa_attention_forward_not_causal(self):
+        sdpa_attention_forward = sdpa_attention.sdpa_attention_forward
+        patched_sdpa_attention_forward = patch_transformers.patched_sdpa_attention_forward
+        kwargs = {
+            "module": None,
+            "query": torch.rand((1, 2, 1, 96), dtype=torch.float32),
+            "key": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "value": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "attention_mask": None,
+            "attention_dropout": 0,
+            "scaling": 0.10206207261596575,
+            "is_causal": False,
+        }
+        expected = sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        got = patched_sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        self.assertEqualArray(expected, got)
+
+        kwargs = {
+            "module": None,
+            "query": torch.rand((1, 2, 1, 96), dtype=torch.float32),
+            "key": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "value": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "attention_mask": torch.tensor([[[[True, True, True, True]]]]),
+            "attention_dropout": 0,
+            "scaling": 0.10206207261596575,
+            "is_causal": False,
+        }
+        expected = sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        got = patched_sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        self.assertEqualArray(expected, got)
+
+    @requires_transformers("4.55")
+    def test_sdpa_attention_forward_causal(self):
+        sdpa_attention_forward = sdpa_attention.sdpa_attention_forward
+        patched_sdpa_attention_forward = patch_transformers.patched_sdpa_attention_forward
+        kwargs = {
+            "module": None,
+            "query": torch.rand((1, 2, 1, 96), dtype=torch.float32),
+            "key": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "value": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "attention_mask": torch.tensor([[[[True, True, True, True]]]]),
+            "attention_dropout": 0,
+            "scaling": 0.10206207261596575,
+            "is_causal": True,
+        }
+        expected = sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        got = patched_sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        self.assertEqualArray(expected, got)
+
+        kwargs = {
+            "module": None,
+            "query": torch.rand((1, 2, 1, 96), dtype=torch.float32),
+            "key": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "value": torch.rand((1, 2, 4, 96), dtype=torch.float32),
+            "attention_mask": None,
+            "attention_dropout": 0,
+            "scaling": 0.10206207261596575,
+            "is_causal": True,
+        }
+        expected = sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        got = patched_sdpa_attention_forward(**torch_deepcopy(kwargs))[0]
+        self.assertEqualArray(expected, got)
+
+    def test_causal_mask_in_scaled_dot_product_attention(self):
+        # see https://docs.pytorch.org/docs/stable/generated/...
+        # ...torch.nn.functional.scaled_dot_product_attention.html
+
+        query = torch.rand((1, 2, 1, 96), dtype=torch.float32)
+        key = torch.rand((1, 2, 4, 96), dtype=torch.float32)
+        L, S = query.size(-2), key.size(-2)
+        attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
+        self.assertEqual(attn_bias.min().item(), 0)
+        attn_causal_bias = attn_bias.clone()
+
+        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+        attn_causal_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+        self.assertEqual(attn_causal_bias.min().item(), -float("inf"))
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
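The last test follows the reference mask construction from the PyTorch documentation for scaled_dot_product_attention. A small hedged illustration, not part of the test file, of why this matters for the export patch further below: with is_causal=True the reference mask is a lower-triangular (L, S) matrix aligned to the top-left, so during a decoding step where L=1 and S=4 the single query may only attend to the first key, even though the remaining keys come from the cache.

import torch

# Reference causal mask from the PyTorch docs, evaluated for a decoding step (L=1, S=4).
L, S = 1, 4
temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
print(temp_mask)  # tensor([[ True, False, False, False]]) -> only the first key is visible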

onnx_diagnostic/helpers/torch_helper.py

Lines changed: 7 additions & 1 deletion
@@ -856,9 +856,15 @@ def torch_deepcopy(value: Any) -> Any:
         ), f"Unexpected type={type(value)}"
         return copy.deepcopy(value)
 
+    if hasattr(value, "__nocopy__"):
+        return value
+
     # We should have a code using serialization, deserialization assuming a model
     # cannot be exported without them.
-    raise NotImplementedError(f"torch_deepcopy not implemented for type {type(value)}")
+    raise NotImplementedError(
+        f"torch_deepcopy not implemented for type {type(value)}, "
+        f"add attribute '__nocopy__' to return it as is."
+    )
 
 
 def torch_tensor_size(value: Any) -> Any:
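The change adds an escape hatch to torch_deepcopy: an object exposing a __nocopy__ attribute is returned as is instead of triggering NotImplementedError. A hedged sketch of how a caller might opt in (the class is hypothetical; only the attribute check comes from the change above, and it assumes no earlier branch of torch_deepcopy claims the object):

from onnx_diagnostic.helpers.torch_helper import torch_deepcopy

class OpaqueHandle:
    # Hypothetical wrapper around something that cannot (or should not) be copied.
    __nocopy__ = True

    def __init__(self, resource):
        self.resource = resource

handle = OpaqueHandle(resource=object())
assert torch_deepcopy(handle) is handle  # returned unchanged, no copy attempted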

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 88 additions & 18 deletions
@@ -39,19 +39,45 @@
 except ImportError:
     patch_DynamicLayer = False
 
-from ...ext_test_case import has_transformers
-from ...helpers.torch_helper import is_torchdynamo_exporting
 
-patch_is_initialized = pv.Version(transformers.__version__) > pv.Version("4.56.99")
+def _has_transformers(version: str) -> bool:
+    return pv.Version(transformers.__version__) >= pv.Version(version)
+
+
+def _is_torchdynamo_exporting() -> bool:
+    """
+    Tells if :epkg:`torch` is exporting a model.
+    Relies on ``torch.compiler.is_exporting()``.
+    """
+    import torch
+
+    if not hasattr(torch.compiler, "is_exporting"):
+        # torch.compiler.is_exporting requires torch>=2.7
+        return False
+
+    try:
+        return torch.compiler.is_exporting()
+    except Exception:
+        try:
+            import torch._dynamo as dynamo
+
+            return dynamo.is_exporting()  # type: ignore
+        except Exception:
+            return False
+
+
+patch_is_initialized = _has_transformers("4.56.99")
 
 
 if patch_masking_utils:
     # Introduced in 4.52
     from transformers.masking_utils import (
+        _ignore_causal_mask_sdpa,
+        _ignore_bidirectional_mask_sdpa,
+        and_masks,
+        bidirectional_mask_function,
         causal_mask_function,
         padding_mask_function,
-        and_masks,
-        _ignore_causal_mask_sdpa,
         prepare_padding_mask,
     )
 
@@ -98,7 +124,7 @@ def vector_mask_function(
     # for a, dims in zip(args, udimensions)
     # ]
     max_shape = tuple(args[i].shape[0] for i in indices)
-    # if is_torchdynamo_exporting():
+    # if _is_torchdynamo_exporting():
     # for a in args:
     # # The exporter should export with a dimension > 1
     # # to make sure it is dynamic.
@@ -151,6 +177,7 @@ def patched_sdpa_mask_recent_torch(
     attention_mask: Optional[torch.Tensor] = None,
     local_size: Optional[int] = None,
     allow_is_causal_skip: bool = True,
+    allow_is_bidirectional_skip: bool = False,
     **kwargs,
 ) -> Optional[torch.Tensor]:
     """manual patch for function ``transformers.masking_utils.sdpa_mask_recent_torch``."""
@@ -160,6 +187,25 @@
         padding_mask, q_length, kv_length, kv_offset, local_size
     ):
         return None
+    if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask):
+        return None
+
+    if mask_function is bidirectional_mask_function:
+        if padding_mask is not None:
+            # used for slicing without data-dependent slicing
+            mask_indices = (
+                torch.arange(kv_length, device=cache_position.device) + kv_offset
+            )
+            return padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
+        return torch.ones(
+            batch_size,
+            1,
+            q_length,
+            kv_length,
+            dtype=torch.bool,
+            device=cache_position.device,
+        )
+
     kv_arange = torch.arange(kv_length, device=cache_position.device)
     kv_arange += kv_offset
     if padding_mask is not None:
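The new branch handles bidirectional_mask_function without data-dependent slicing: the kept positions are gathered through a precomputed index tensor and broadcast to the usual (batch, 1, q_length, kv_length) layout. A minimal sketch of that indexing pattern (tensor values are made up; only the expression is taken from the patched function above):

import torch

padding_mask = torch.tensor([[True, True, True, True]])  # (batch=1, kv_length=4)
kv_length, kv_offset, q_length = 4, 0, 1
mask_indices = torch.arange(kv_length) + kv_offset        # static indices, export-friendly
mask = padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
print(mask.shape)  # torch.Size([1, 1, 1, 4])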
@@ -275,7 +321,7 @@ class patched_AttentionMaskConverter:
     """
 
     # This method was fixed in 4.51 at least.
-    _PATCHES_ = ["_make_causal_mask"] if not has_transformers("4.48.3") else []
+    _PATCHES_ = ["_make_causal_mask"] if not _has_transformers("4.48.3") else []
     _PATCHED_CLASS_ = AttentionMaskConverter
 
     @staticmethod
@@ -507,7 +553,7 @@ def _cache_dependant_input_preparation(
         The current implementation does not rely on ``self`` and could be
         a class method. It is left as a standard method to be easily rewritten.
         """
-        if is_torchdynamo_exporting():
+        if _is_torchdynamo_exporting():
             return self._cache_dependant_input_preparation_exporting(
                 input_ids, inputs_embeds, cache_position
             )
@@ -1316,16 +1362,40 @@ def patched_sdpa_attention_forward(
         attention_mask is None or attention_mask.shape[3] == key.shape[2],
         "Attention mask shape incompatible with key shape.",
     )
-    attn_output = torch.nn.functional.scaled_dot_product_attention(
-        query,
-        key,
-        value,
-        attn_mask=attention_mask,
-        dropout_p=dropout,
-        scale=scaling,
-        is_causal=is_causal,
-        **sdpa_kwargs,
-    )
+    if is_causal:
+        attn_output = torch.cond(
+            query.shape[2] > 1,  # distinction between prefill and decoding steps
+            lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                dropout_p=dropout,
+                scale=scaling,
+                is_causal=True,
+                **sdpa_kwargs,
+            ),
+            lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                dropout_p=dropout,
+                scale=scaling,
+                is_causal=False,
+                **sdpa_kwargs,
+            ),
+            [query, key, value],
+        )
+    else:
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=dropout,
+            scale=scaling,
+            is_causal=is_causal,
+            **sdpa_kwargs,
+        )
     attn_output = attn_output.transpose(1, 2).contiguous()
     return attn_output, None
 
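For the causal case the patched forward now goes through torch.cond, so the exporter captures both the prefill branch (several query positions, is_causal=True) and the decoding branch (a single query position attending to the whole cache, is_causal=False), with the predicate query.shape[2] > 1 choosing between them at runtime. A standalone sketch of that pattern (shapes are made up; dropout, scaling and sdpa_kwargs are omitted, and torch.cond requires a recent PyTorch):

import torch
import torch.nn.functional as F

query = torch.rand(1, 2, 1, 8)  # decoding step: a single query position
key = torch.rand(1, 2, 4, 8)    # cache of length 4
value = torch.rand(1, 2, 4, 8)

attn = torch.cond(
    query.shape[2] > 1,  # prefill vs decoding
    lambda q, k, v: F.scaled_dot_product_attention(q, k, v, is_causal=True),
    lambda q, k, v: F.scaled_dot_product_attention(q, k, v, is_causal=False),
    [query, key, value],
)
print(attn.shape)  # torch.Size([1, 2, 1, 8])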