@@ -818,3 +818,152 @@ def forward(self, x, position_ids):
         sin = emb.sin() * self.attention_scaling
 
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class patched_IdeficsEmbedding(torch.nn.Module):
+    _PATCHES_ = ["forward"]
+    _PATCHED_CLASS_ = transformers.models.idefics.modeling_idefics.IdeficsEmbedding
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # if seq_len > self.max_seq_len_cached:
+        #     self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
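+        # The commented-out branch above is the original implementation: it grows the
+        # cos/sin cache with Python control flow that depends on seq_len, which
+        # torch.export cannot trace. The torch.cond rewrite below keeps both outcomes
+        # in the exported graph.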
+
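+        # Branch used when the requested length exceeds the cached tables:
+        # recompute the rotary frequencies for seq_len positions from scratch.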
+        def _set_cos_sin_cache_then(x, inv_freq, seq_len, _cos_cached, _sin_cached):
+            t = torch.arange(seq_len, device=x.device, dtype=torch.int64).type_as(inv_freq)
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            return emb.cos().to(x.dtype), emb.sin().to(x.dtype)
+
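+        # Branch used when the cache is already large enough: slice the cached tables.
+        # torch._check records the bound seq_len <= cache size so the exporter accepts
+        # the data-dependent slice.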
+        def _set_cos_sin_cache_else(_x, _inv_freq, _seq_len, cos_cached, sin_cached):
+            torch._check(seq_len.item() <= cos_cached.shape[0])
+            co = cos_cached[: seq_len.item()].detach().clone()
+            torch._check(seq_len.item() <= sin_cached.shape[0])
+            si = sin_cached[: seq_len.item()].detach().clone()
+            return co.to(dtype=x.dtype), si.to(dtype=x.dtype)
+
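+        # torch.cond evaluates the predicate at run time but traces both branches,
+        # which must accept the same operands and return matching structures.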
+        cos_cached, sin_cached = torch.cond(
+            (seq_len > self.max_seq_len_cached).item(),
+            _set_cos_sin_cache_then,
+            _set_cos_sin_cache_else,
+            [x, self.inv_freq, seq_len, self.cos_cached, self.sin_cached],
+        )
+        return cos_cached, sin_cached
+
+
+class patched_IdeficsAttention(torch.nn.Module):
+    _PATCHES_ = ["forward"]
+    _PATCHED_CLASS_ = transformers.models.idefics.modeling_idefics.IdeficsAttention
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        is_cross_attention = self.is_cross_attention or key_value_states is not None
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = (
+            self.q_proj(hidden_states)
+            .view(bsz, q_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+        if not is_cross_attention:
+            key_states = (
+                self.k_proj(hidden_states)
+                .view(bsz, q_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+            value_states = (
+                self.v_proj(hidden_states)
+                .view(bsz, q_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+        else:
+            _, kv_len, _ = (
+                key_value_states.size()
+            )  # Note that, in this case, `kv_len` == `kv_seq_len`
+            key_states = (
+                self.k_proj(key_value_states)
+                .view(bsz, kv_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+            value_states = (
+                self.v_proj(key_value_states)
+                .view(bsz, kv_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
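+            # cache_position[0] is the index of the first token of this step, i.e. the
+            # number of tokens already stored in the cache.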
+            kv_seq_len += cache_position[0]
+
+        if not is_cross_attention:
+            rotary_length = torch.maximum(
+                torch.tensor(kv_seq_len, dtype=torch.int64),
+                torch.tensor(q_len, dtype=torch.int64),
+            )
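+            # The rotary tables must cover both key and query positions; passing the
+            # length as a tensor keeps it symbolic for the patched IdeficsEmbedding above.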
+            cos, sin = self.rotary_emb(value_states, seq_len=rotary_length)
+            query_states, key_states = (
+                transformers.models.idefics.modeling_idefics.apply_rotary_pos_emb(
+                    query_states, key_states, cos, sin, position_ids
+                )
+            )
+        # [bsz, nh, t, hd]
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models;
+            # cache_position needed for the static cache
+            cache_kwargs = {"cache_position": cache_position}
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
+        if self.qk_layer_norms:
+            query_states = self.q_layer_norm(query_states)
+            key_states = self.k_layer_norm(key_states)
+
+        attention_interface: Callable = (
+            transformers.models.idefics.modeling_idefics.eager_attention_forward
+        )
+
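+        # Default to the eager implementation; swap in the configured kernel below,
+        # except that SDPA cannot return attention weights.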
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                transformers.models.idefics.modeling_idefics.logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support "
+                    "`output_attentions=True`. Falling back to "
+                    "eager attention. This warning can be removed using the argument "
+                    '`attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS[
+                    self.config._attn_implementation
+                ]
+
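+        # Run the selected attention kernel; dropout is only active in training mode.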
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
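+        # Merge the head dimension back into the hidden size before the output projection.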
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
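
The `_PATCHES_` and `_PATCHED_CLASS_` attributes suggest these methods are copied onto the target transformers classes by a helper elsewhere in the repository. A minimal sketch of that idea, assuming a plain monkey-patching helper (`apply_patches` and `restore_patches` below are hypothetical names, not the project's actual API):

def apply_patches(*patch_classes):
    # Copy each method listed in _PATCHES_ onto _PATCHED_CLASS_, keeping the originals.
    originals = {}
    for patch in patch_classes:
        target = patch._PATCHED_CLASS_
        for name in patch._PATCHES_:
            originals[(target, name)] = getattr(target, name)
            setattr(target, name, getattr(patch, name))
    return originals


def restore_patches(originals):
    # Undo apply_patches by putting the saved methods back.
    for (target, name), method in originals.items():
        setattr(target, name, method)


# Hypothetical usage: patch before exporting, restore afterwards.
# saved = apply_patches(patched_IdeficsEmbedding, patched_IdeficsAttention)
# ... run torch.export / ONNX export on the Idefics model ...
# restore_patches(saved)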