2 changes: 2 additions & 0 deletions CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
0.7.0
+++++

* :pr:`146`: patch for IdeficsAttention, IdeficsEmbedding
* :pr:`145`: patch for _compute_dynamic_ntk_parameters (Phi3RotaryEmbedding)
* :pr:`144`: support for second inputs with different dimension,
rename test_helper into validate,
support ``interpolate_pos_encoding`` for ``VitModel``,
2 changes: 1 addition & 1 deletion README.rst
@@ -22,7 +22,7 @@ onnx-diagnostic: investigate onnx models
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
:target: https://github.com/psf/black

.. image:: https://codecov.io/gh/sdpython/onnx-diagnostic/branch/main/graph/badge.svg?token=Wb9ZGDta8J
.. image:: https://codecov.io/gh/sdpython/onnx-diagnostic/graph/badge.svg?token=91T5ZVIP96
:target: https://codecov.io/gh/sdpython/onnx-diagnostic

The main feature is about `patches <https://github.com/sdpython/onnx-diagnostic/tree/main/onnx_diagnostic/torch_export_patches>`_:
8 changes: 7 additions & 1 deletion _doc/examples/plot_export_hub_codellama.py
@@ -20,6 +20,7 @@
import pprint
import torch
from onnx_diagnostic import doc
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.torch_models.hghub import (
get_untrained_model_with_inputs,
@@ -32,7 +33,12 @@
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str

model_id = "codellama/CodeLlama-7b-Python-hf"
model_id = (
"HuggingFaceM4/tiny-random-idefics"
if unit_test_going()
else "codellama/CodeLlama-7b-Python-hf"
)
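# When the example is executed as part of the test suite (unit_test_going()
# returns True), a tiny random checkpoint is used instead of the full CodeLlama
# model so the run stays short.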
print(f"model_id={model_id!r}")
print("info", get_model_info(model_id))

# %%
2 changes: 1 addition & 1 deletion _doc/index.rst
@@ -15,7 +15,7 @@ onnx-diagnostic: investigate onnx models
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
:target: https://github.com/psf/black

.. image:: https://codecov.io/gh/sdpython/onnx-diagnostic/branch/main/graph/badge.svg?token=Wb9ZGDta8J
.. image:: https://codecov.io/gh/sdpython/onnx-diagnostic/graph/badge.svg?token=91T5ZVIP96
:target: https://codecov.io/gh/sdpython/onnx-diagnostic

The main feature is about `patches <https://github.com/sdpython/onnx-diagnostic/tree/main/onnx_diagnostic/torch_export_patches>`_:
2 changes: 1 addition & 1 deletion _unittests/ut_helpers/test_doc_helper.py
@@ -56,7 +56,7 @@ def test_custom_doc_kernels_layer_normalization(self):
)
expected = torch_sess.run(None, feeds)
got = torch_sess_custom.run(None, feeds)
self.assertEqualAny(expected, got, atol=1e-3)
self.assertEqualAny(expected, got, atol=2e-3)

def test_custom_doc_kernels_matmul(self):
model = oh.make_model(
2 changes: 1 addition & 1 deletion _unittests/ut_helpers/test_helper.py
@@ -584,7 +584,7 @@ def test_flatten_encoder_decoder_cache(self):
self.assertIn("EncoderDecoderCache", s)

def test_string_typeçconfig(self):
conf = get_pretrained_config("microsoft/phi-2")
conf = get_pretrained_config("microsoft/phi-2", use_only_preinstalled=True)
s = string_type(conf)
self.assertStartsWith("PhiConfig(**{", s)

13 changes: 8 additions & 5 deletions _unittests/ut_tasks/test_tasks_image_text_to_text.py
@@ -1,13 +1,20 @@
import unittest
import torch
from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout, has_transformers, has_torch
from onnx_diagnostic.ext_test_case import (
ExtTestCase,
hide_stdout,
requires_transformers,
requires_torch,
)
from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str


class TestTasks(ExtTestCase):
@hide_stdout()
@requires_transformers("4.52")
@requires_torch("2.7.99")
def test_image_text_to_text(self):
mid = "HuggingFaceM4/tiny-random-idefics"
data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
@@ -16,10 +23,6 @@ def test_image_text_to_text(self):
model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
model(**inputs)
model(**data["inputs2"])
if not has_transformers("4.55"):
raise unittest.SkipTest("The model has control flow.")
if not has_torch("2.7.99"):
raise unittest.SkipTest("sym_max does not work with dynamic dimension")
with torch_export_patches(patch_transformers=True, verbose=10):
torch.export.export(
model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
6 changes: 4 additions & 2 deletions _unittests/ut_torch_models/test_hghub_api.py
@@ -72,14 +72,16 @@ def test_task_from_id_long(self):
@requires_torch("2.7")
@hide_stdout()
def test_get_pretrained_config(self):
conf = get_pretrained_config("microsoft/phi-2")
conf = get_pretrained_config("microsoft/phi-2", use_only_preinstalled=True)
self.assertNotEmpty(conf)

@requires_transformers("4.50")
@requires_torch("2.7")
@hide_stdout()
def test_get_pretrained_config_options(self):
conf = get_pretrained_config("microsoft/phi-2", num_key_value_heads=16)
conf = get_pretrained_config(
"microsoft/phi-2", num_key_value_heads=16, use_only_preinstalled=True
)
self.assertNotEmpty(conf)
self.assertEqual(conf.num_key_value_heads, 16)

16 changes: 3 additions & 13 deletions _unittests/ut_torch_models/test_hghub_model.py
@@ -75,16 +75,6 @@ def test_get_untrained_model_with_inputs_beit(self):
# different expected value for different version of transformers
self.assertIn((data["size"], data["n_weights"]), [(111448, 27862), (56880, 14220)])

@hide_stdout()
@ignore_errors(OSError)
def test_get_untrained_model_with_inputs_codellama(self):
mid = "codellama/CodeLlama-7b-Python-hf"
data = get_untrained_model_with_inputs(mid, verbose=1)
model, inputs = data["model"], data["inputs"]
model(**inputs)
# different expected value for different version of transformers
self.assertIn((data["size"], data["n_weights"]), [(547377152, 136844288)])

@hide_stdout()
@ignore_errors(OSError)
def test_get_untrained_model_with_inputs_clip_vit(self):
@@ -129,11 +119,11 @@ def _diff(c1, c2):
try:
model(**inputs)
except Exception as e:
diff = _diff(get_pretrained_config(mid), data["configuration"])
cf = get_pretrained_config(mid, use_only_preinstalled=True)
diff = _diff(cf, data["configuration"])
raise AssertionError(
f"Computation failed due to {e}.\n--- pretrained\n"
f"{pprint.pformat(get_pretrained_config(mid))}\n"
f"--- modified\n{data['configuration']}\n"
f"{pprint.pformat(cf)}\n--- modified\n{data['configuration']}\n"
f"--- diff\n{diff}"
) from e
# different expected value for different version of transformers
25 changes: 24 additions & 1 deletion _unittests/ut_torch_models/test_validate_whole_models.py
@@ -270,7 +270,30 @@ def test_validate_phi35_mini_instruct(self):
inputs2=True,
patch=True,
rewrite=True,
# model_options={"rope_scaling": {"rope_type": "dynamic", "factor": 10.0}},
)
self.assertIsInstance(summary, dict)
self.assertIsInstance(data, dict)
onnx_filename = data["onnx_filename"]
onx = onnx.load(onnx_filename)
op_types = set(n.op_type for n in onx.graph.node)
self.assertIn("If", op_types)

@requires_torch("2.7")
@hide_stdout()
@ignore_warnings(FutureWarning)
@requires_transformers("4.51")
def test_validate_phi35_4k_mini_instruct(self):
mid = "microsoft/Phi-3-mini-4k-instruct"
summary, data = validate_model(
mid,
do_run=True,
verbose=10,
exporter="custom",
dump_folder="dump_test/validate_phi35_mini_instruct",
inputs2=True,
patch=True,
rewrite=True,
model_options={"rope_scaling": {"rope_type": "dynamic", "factor": 10.0}},
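# rope_scaling with rope_type="dynamic" exercises _compute_dynamic_ntk_parameters,
# the code path patched by PR 145 (see the changelog entry above).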
)
self.assertIsInstance(summary, dict)
self.assertIsInstance(data, dict)
18 changes: 15 additions & 3 deletions _unittests/ut_xrun_doc/test_documentation_examples.py
@@ -92,10 +92,22 @@ def add_test_methods(cls):

if (
not reason
and name in {"plot_export_locate_issue.py", "plot_export_with_auto.py"}
and not has_torch("4.7")
and name in {"plot_export_hub_codellama.py"}
and not has_transformers("4.52")
):
reason = "torch<2.7"
reason = "transformers<4.52"

if (
not reason
and name
in {
"plot_export_locate_issue.py",
"plot_export_with_auto.py",
"plot_export_hub_codellama.py",
}
and not has_torch("2.8")
):
reason = "torch<2.8"

if reason:

149 changes: 149 additions & 0 deletions onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
@@ -818,3 +818,152 @@ def forward(self, x, position_ids):
sin = emb.sin() * self.attention_scaling

return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class patched_IdeficsEmbedding(torch.nn.Module):
_PATCHES_ = ["forward"]
_PATCHED_CLASS_ = transformers.models.idefics.modeling_idefics.IdeficsEmbedding

def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
# if seq_len > self.max_seq_len_cached:
# self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
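# The data-dependent `if` above cannot be captured by torch.export, so the
# patch expresses the same logic with torch.cond: one branch recomputes the
# cos/sin cache when seq_len exceeds the cached length, the other slices the
# existing cache.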

def _set_cos_sin_cache_then(x, inv_freq, seq_len, _cos_cached, _sin_cached):
t = torch.arange(seq_len, device=x.device, dtype=torch.int64).type_as(inv_freq)
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
return emb.cos().to(x.dtype), emb.sin().to(x.dtype)

def _set_cos_sin_cache_else(_x, _inv_freq, _seq_len, cos_cached, sin_cached):
torch._check(seq_len.item() <= cos_cached.shape[0])
co = cos_cached[: seq_len.item()].detach().clone()
torch._check(seq_len.item() <= sin_cached.shape[0])
si = sin_cached[: seq_len.item()].detach().clone()
return co.to(dtype=x.dtype), si.to(dtype=x.dtype)
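# torch.cond requires both branch functions to accept the same operand list and
# to return tensors with the same structure, hence the unused parameters above.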

cos_cached, sin_cached = torch.cond(
(seq_len > self.max_seq_len_cached).item(),
_set_cos_sin_cache_then,
_set_cos_sin_cache_else,
[x, self.inv_freq, seq_len, self.cos_cached, self.sin_cached],
)
return cos_cached, sin_cached


class patched_IdeficsAttention(torch.nn.Module):
_PATCHES_ = ["forward"]
_PATCHED_CLASS_ = transformers.models.idefics.modeling_idefics.IdeficsAttention

def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# if key_value_states are provided this layer is used as a cross-attention layer
is_cross_attention = self.is_cross_attention or key_value_states is not None

bsz, q_len, _ = hidden_states.size()

query_states = (
self.q_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
if not is_cross_attention:
key_states = (
self.k_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
value_states = (
self.v_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
else:
_, kv_len, _ = (
key_value_states.size()
) # Note that, in this case, `kv_len` == `kv_seq_len`
key_states = (
self.k_proj(key_value_states)
.view(bsz, kv_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
value_states = (
self.v_proj(key_value_states)
.view(bsz, kv_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)

kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += cache_position[0]

if not is_cross_attention:
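# Patch: the rotary cache length is built with tensor ops (torch.maximum over
# 0-d int64 tensors) instead of a Python max(), so the patched IdeficsEmbedding
# above can consume it inside torch.cond.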
rotary_length = torch.maximum(
torch.tensor(kv_seq_len, dtype=torch.int64),
torch.tensor(q_len, dtype=torch.int64),
)
cos, sin = self.rotary_emb(value_states, seq_len=rotary_length)
query_states, key_states = (
transformers.models.idefics.modeling_idefics.apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
)
# [bsz, nh, t, hd]

if past_key_value is not None:
# sin and cos are specific to RoPE models;
# cache_position needed for the static cache
cache_kwargs = {"cache_position": cache_position}
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, cache_kwargs
)

if self.qk_layer_norms:
query_states = self.q_layer_norm(query_states)
key_states = self.k_layer_norm(key_states)

attention_interface: Callable = (
transformers.models.idefics.modeling_idefics.eager_attention_forward
)

if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
transformers.models.idefics.modeling_idefics.logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support "
"`output_attentions=True`. Falling back to "
"eager attention. This warning can be removed using the argument "
'`attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS[
self.config._attn_implementation
]

attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=0.0 if not self.training else self.dropout,
scaling=self.scaling,
**kwargs,
)

attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
attn_output = self.o_proj(attn_output)

if output_attentions:
attn_weights = None

return attn_output, attn_weights, past_key_value
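
For context, here is a minimal usage sketch, condensed from the updated ``test_image_text_to_text`` above, showing how these Idefics patches are exercised; the model id and helper functions are the ones appearing in this diff:

import torch
from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str

# untrained tiny Idefics model with a consistent set of inputs and dynamic shapes
mid = "HuggingFaceM4/tiny-random-idefics"
data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]

# torch_export_patches temporarily swaps in the patched classes defined above
# (patched_IdeficsEmbedding, patched_IdeficsAttention, ...) during the export.
with torch_export_patches(patch_transformers=True, verbose=10):
    torch.export.export(
        model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
    )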