15 changes: 11 additions & 4 deletions _unittests/ut_tasks/try_export.py
@@ -46,20 +46,26 @@ def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
 
         from transformers import AutoModel, AutoProcessor
 
-        # model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-        model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+        model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
+        # model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
         if os.environ.get("PRETRAINED", ""):
-            model = AutoModel.from_pretrained(model_id, device_map="auto", dtype="auto").eval()
+            print("-- pretrained model")
+            model = AutoModel.from_pretrained(
+                model_id, device_map=device, dtype=torch_dtype, attn_implementation="sdpa"
+            ).eval()
         else:
             print("-- random model")
 
             def _config_reduction(config, task):
                 return {
-                    "num_hidden_layers": 2,
+                    # "num_hidden_layers": 2,
                     "text_config": {
                         "num_hidden_layers": 2,
                         "layer_types": ["full_attention", "full_attention"],
                     },
+                    # "_attn_implementation": "flash_attention_2",
+                    "_attn_implementation": "sdpa",
+                    "dtype": "float16",
                 }
 
             config_reduction = _config_reduction
@@ -70,6 +76,7 @@ def _config_reduction(config, task):
 
         model = model.to(device).to(getattr(torch, dtype))
 
+        print(f"-- config._attn_implementation={model.config._attn_implementation}")
         print(f"-- model.dtype={model.dtype}")
         print(f"-- model.device={model.device}")
         processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
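Note: the `PRETRAINED` branch above loads the real checkpoint, while `_config_reduction` shrinks the configuration so the randomly initialized model stays small. A minimal sketch of the same reduction with plain `transformers` follows; it is an illustration, not the test's code, and it assumes the checkpoint's config exposes `text_config`, mirroring only the keys returned by `_config_reduction`:

```python
import torch
from transformers import AutoConfig, AutoModel

# Sketch: build a tiny random Qwen2.5-VL-like model by overriding the same
# keys the test's _config_reduction returns (assumes text_config exists).
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
config.text_config.num_hidden_layers = 2
config.text_config.layer_types = ["full_attention", "full_attention"]
config._attn_implementation = "sdpa"

model = AutoModel.from_config(config).eval().to(torch.float16)
```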
36 changes: 15 additions & 21 deletions _unittests/ut_tasks/try_tasks.py
@@ -1011,15 +1011,20 @@ def test_imagetext2text_qwen_2_5_vl_instruct(self):
             return_dict:bool
         )
         """
-        import transformers
-        from transformers import AutoModel, AutoProcessor
+        from transformers import AutoProcessor
         from qwen_vl_utils import process_vision_info
 
-        # model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-        model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+        model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
+        # model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
         if os.environ.get("PRETRAINED", ""):
-            model = AutoModel.from_pretrained(model_id, device_map="auto", dtype="auto").eval()
+            print("-- use pretrained model")
+            from transformers import Qwen2_5_VLForConditionalGeneration
+
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id, device_map="auto", dtype="auto", trust_remote_code=True
+            ).eval()
         else:
+            print("-- use dummy model")
 
             def config_reduction(config, task):
                 return {
@@ -1035,6 +1040,7 @@ def config_reduction(config, task):
         )
         model = data["model"]
 
+        print(f"-- model type={type(model)}")
         print(f"-- model.device={model.device}")
         processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
         print(f"-- processor={type(processor)}")
@@ -1063,25 +1069,13 @@ def config_reduction(config, task):
             padding=True,
             return_tensors="pt",
         )
-        inputs = inputs.to("cuda")
-        model = model.to("cuda").to(torch.bfloat16)
+        # model = model.to("cuda").to(torch.bfloat16)
+        # inputs = inputs.to("cuda")
 
+        print(f"-- processor {type(processor)}")
         print(f"-- inputs={self.string_type(inputs, with_shape=True, with_min_max=True)}")
-
-        f_ = transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.apply_multimodal_rotary_pos_emb
-
-        def _apply_multimodal_rotary_pos_emb(*args, **kwargs):
-            print(
-                "-- apply_multimodal_rotary_pos_emb:",
-                self.string_type(args, with_shape=True),
-                self.string_type(kwargs, with_shape=True),
-            )
-            return f_(*args, **kwargs)
-
-        transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.apply_multimodal_rotary_pos_emb = (
-            _apply_multimodal_rotary_pos_emb
-        )
         generated_ids = model.generate(**inputs, max_new_tokens=128)
+        print("-- second")
 
         print()
         with (
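The block deleted above monkey-patched `apply_multimodal_rotary_pos_emb` to print its argument shapes during `generate`. The same tracing pattern, kept here as a minimal self-contained sketch (the shape printing stands in for the test's `self.string_type` helper, and the `try/finally` restore is an addition for safety):

```python
import functools

import transformers.models.qwen2_5_vl.modeling_qwen2_5_vl as qwen_mod

# Sketch of the removed tracing pattern: wrap a module-level function,
# log its arguments, then delegate to the original implementation.
_original = qwen_mod.apply_multimodal_rotary_pos_emb

@functools.wraps(_original)
def _traced(*args, **kwargs):
    print("-- apply_multimodal_rotary_pos_emb:",
          [getattr(a, "shape", type(a).__name__) for a in args])
    return _original(*args, **kwargs)

qwen_mod.apply_multimodal_rotary_pos_emb = _traced
try:
    pass  # run model.generate(...) here; every call is logged
finally:
    qwen_mod.apply_multimodal_rotary_pos_emb = _original  # always restore
```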
46 changes: 45 additions & 1 deletion onnx_diagnostic/helpers/mini_onnx_builder.py
@@ -422,6 +422,27 @@ def create_onnx_model_from_input_tensors(
     :return: ModelProto
 
     The function raises an error if not supported.
+    An example:
+
+    .. code-block:: python
+
+        from onnx_diagnostic.helpers.mini_onnx_builder import (
+            create_onnx_model_from_input_tensors,
+        )
+        import onnx
+
+        proto = create_onnx_model_from_input_tensors(
+            dict(
+                query_states=query_states,
+                key_states=key_states,
+                value_states=value_states,
+                cu_seqlens=cu_seqlens,
+                max_seqlen=(cu_seqlens[1:] - cu_seqlens[:-1]).max(),
+                scaling=self.scaling,
+                attn_output=attn_output,
+            )
+        )
+        onnx.save(proto, "attention_inputs.onnx")
     """
     if switch_low_high is None:
         switch_low_high = sys.byteorder != "big"
@@ -461,7 +482,17 @@ def _unflatten(
         if spl[-1] == "array":
             return pos + 1, outputs[pos]
         if spl[-1] == "tensor":
-            return pos + 1, torch.from_numpy(outputs[pos]).to(device)
+            try:
+                return pos + 1, torch.from_numpy(outputs[pos]).to(device)
+            except TypeError:
+                # it should be more robust
+                import ml_dtypes
+
+                if outputs[pos].dtype == ml_dtypes.bfloat16:
+                    return pos + 1, torch.from_numpy(outputs[pos].astype(float)).to(device).to(
+                        torch.bfloat16
+                    )
+                raise
         raise AssertionError(f"Unexpected name {name!r} in {names}")
 
     res: List[Any] = []
@@ -557,6 +588,19 @@ def create_input_tensors_from_onnx_model(
     :return: restored data
 
     See example :ref:`l-plot-intermediate-results` for an example.
+
+    .. code-block:: python
+
+        import os
+        from onnx_diagnostic.helpers.mini_onnx_builder import (
+            create_input_tensors_from_onnx_model,
+        )
+        from onnx_diagnostic.helpers import string_type
+
+        restored = create_input_tensors_from_onnx_model("attention_inputs.onnx")
+        for k, v in restored.items():
+            print(f"{k}: {string_type(v, with_shape=True, with_min_max=True)}")
+
     """
     if engine == "ExtendedReferenceEvaluator":
         from ..reference import ExtendedReferenceEvaluator
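The `try/except TypeError` added to `_unflatten` works around `torch.from_numpy` rejecting NumPy arrays with the `ml_dtypes.bfloat16` dtype. A standalone sketch of that round trip (assumes `ml_dtypes` is installed; the sample values are made up for the demo):

```python
import ml_dtypes
import numpy as np
import torch

# NumPy represents bfloat16 through ml_dtypes; torch.from_numpy rejects it.
arr = np.array([1.5, -2.25, 3.0], dtype=ml_dtypes.bfloat16)

try:
    t = torch.from_numpy(arr)
except TypeError:
    # Same workaround as the PR: upcast to float64, convert, downcast back.
    t = torch.from_numpy(arr.astype(float)).to(torch.bfloat16)

print(t.dtype)  # torch.bfloat16
```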