1 change: 0 additions & 1 deletion _unittests/ut_export/test_serialization.py
@@ -88,7 +88,6 @@ def forward(self, cache):
Model(), [(self._get_cache(),), (self._get_cache(bsize=3, nheads=5),)]
)
guessed = md.guess_dynamic_shapes()
print("****", guessed)
DYN = torch.export.Dim.DYNAMIC
self.assertEqual(
(([{0: DYN, 1: DYN}, {0: DYN, 1: DYN}, {0: DYN, 1: DYN}, {0: DYN, 1: DYN}],), {}),
48 changes: 39 additions & 9 deletions _unittests/ut_tasks/try_export.py
@@ -14,7 +14,9 @@ class TestTryExportHuggingFaceHubModel(ExtTestCase):
@ignore_warnings(UserWarning)
def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
"""
clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k qwen_2_5
clear&&NEVERTEST=1 python _unittests/ut_tasks/try_export.py -k qwen_2_5

possible prefix: ``TESTDEVICE=cuda TESTDTYPE=float16 EXPORTER=onnx-dynamo``

::

@@ -33,6 +35,15 @@ def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
return_dict:bool
)
"""
device = os.environ.get("TESTDEVICE", "cpu")
dtype = os.environ.get("TESTDTYPE", "float32")
torch_dtype = {
"float16": torch.float16,
"bfloat16": torch.bfloat16,
"float32": torch.float32,
}[dtype]
exporter = os.environ.get("EXPORTER", "custom")

from transformers import AutoModel, AutoProcessor

# model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
@@ -57,28 +68,28 @@ def _config_reduction(config, task):
)
model = data["model"]

model = model.to("cpu").to(torch.float32)
model = model.to(device).to(getattr(torch, dtype))

print(f"-- model.dtype={model.dtype}")
print(f"-- model.device={model.device}")
processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
print(f"-- processor={type(processor)}")

inputs = dict(
hidden_states=torch.rand((1292, 1176), dtype=torch.float32),
grid_thw=torch.tensor([[1, 34, 38]], dtype=torch.int64),
hidden_states=torch.rand((1292, 1176), dtype=torch_dtype).to(device),
grid_thw=torch.tensor([[1, 34, 38]], dtype=torch.int64).to(device),
)

print(f"-- inputs: {self.string_type(inputs, with_shape=True)}")
# this is too long
# expected = model.visual(**inputs)
# print(f"-- expected: {self.string_type(expected, with_shape=True)}")
expected = model.visual(**inputs)
print(f"-- expected: {self.string_type(expected, with_shape=True)}")

exporter = "custom" # "onnx-dynamo"
filename = self.get_dump_file(
f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{exporter}.onnx"
f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.onnx"
)
fileep = self.get_dump_file(
f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{exporter}.graph"
f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.graph"
)
dynamic_shapes = dict(
hidden_states={0: "hidden_width", 1: "hidden_height"},
@@ -103,8 +114,27 @@ def _config_reduction(config, task):
exporter=exporter,
verbose=1,
save_ep=fileep,
target_opset=22,
optimize=True,
)

self.assert_onnx_disc(
f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}",
filename,
model.visual,
export_inputs,
verbose=1,
providers=(
["CUDAExecutionProvider", "CPUExecutionProvider"]
if device == "cuda"
else ["CPUExecutionProvider"]
),
use_ort=True,
atol=0.02,
rtol=10,
ort_optimized_graph=False,
)


if __name__ == "__main__":
unittest.main(verbosity=2)
4 changes: 2 additions & 2 deletions _unittests/ut_torch_models/test_tiny_llms_bypassed.py
@@ -28,8 +28,8 @@ def test_export_tiny_llm_2_bypassed(self):
inputs = modificator(copy.deepcopy(inputs))

def debug():
print("***", string_type(inputs, with_shape=True))
print("***", data["dynamic_shapes"])
print("--", string_type(inputs, with_shape=True))
print("--", data["dynamic_shapes"])
import torch.export._draft_export

_ep, report = torch.export._draft_export.draft_export(
4 changes: 2 additions & 2 deletions _unittests/ut_torch_models/test_tiny_llms_onnx.py
@@ -110,8 +110,8 @@ def test_bypass_onnx_export_tiny_llm_official_full(self):
self.assertEqual(
{"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)
)
print("***", self.string_type(inputs, with_shape=True))
print("---", type(model))
print("--", self.string_type(inputs, with_shape=True))
print("--", type(model))
with torch_export_patches(
patch_transformers=True, verbose=1, stop_if_static=1
) as modificator:
5 changes: 4 additions & 1 deletion onnx_diagnostic/export/api.py
@@ -16,6 +16,7 @@ def to_onnx(
exporter: str = "onnx-dynamo",
exporter_kwargs: Optional[Dict[str, Any]] = None,
save_ep: Optional[str] = None,
optimize: bool = True,
use_control_flow_dispatcher: bool = False,
) -> Any:
"""
@@ -37,6 +38,7 @@
:param exporter: exporter to use (``onnx-dynamo``, ``modelbuilder``, ``custom``)
:param exporter_kwargs: additional parameters sent to the exporter
:param save_ep: saves the exported program
:param optimize: optimizes the exported model (e.g. runs onnxruntime fusions for the onnx-dynamo exporter)
:param use_control_flow_dispatcher: use the dispatcher created to support
custom loops (see :func:`onnx_diagnostic.export.control_flow.loop_for`)
:return: the output of the selected exporter, usually a structure including
@@ -106,7 +108,8 @@ def to_onnx(
dynamo=True,
**(exporter_kwargs or {}),
)
ort_fusions.optimize_for_ort(epo.model)
if optimize:
ort_fusions.optimize_for_ort(epo.model)
if filename:
epo.save(filename, external_data=True)
return epo
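
A minimal sketch (not part of this PR) of how the new ``optimize`` flag could be called; the positional arguments and the ``filename`` keyword below are assumptions for illustration, only ``exporter``, ``save_ep``, ``optimize`` and ``use_control_flow_dispatcher`` are visible in this diff::

    # hypothetical usage sketch, not taken from the repository tests
    import torch
    from onnx_diagnostic.export.api import to_onnx

    class Tiny(torch.nn.Module):
        def forward(self, x):
            return x * 2

    # optimize=False skips ort_fusions.optimize_for_ort on the exported model
    to_onnx(
        Tiny(),
        (torch.rand(2, 3),),      # assumed positional inputs
        filename="tiny.onnx",      # assumed keyword
        exporter="onnx-dynamo",
        optimize=False,
    )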
33 changes: 28 additions & 5 deletions onnx_diagnostic/ext_test_case.py
@@ -1188,6 +1188,7 @@ def assert_onnx_disc(
copy_inputs: bool = True,
expected: Optional[Any] = None,
use_ort: bool = False,
ort_optimized_graph: bool = False,
**kwargs,
):
"""
@@ -1206,6 +1207,7 @@
:param expected: expected values
:param copy_inputs: to copy the inputs
:param use_ort: use :class:`onnxruntime.InferenceSession`
:param ort_optimized_graph: dumps the optimized onnxruntime graph
:param kwargs: arguments sent to
:class:`onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch`
"""
@@ -1216,29 +1218,50 @@
kws = dict(with_shape=True, with_min_max=verbose > 1)
vname = test_name or "assert_onnx_disc"
if test_name:
import onnx

name = f"{test_name}.onnx"
if verbose:
print(f"[{vname}] save the onnx model into {name!r}")
name = self.dump_onnx(name, proto)
if isinstance(proto, str):
name = proto
proto = onnx.load(name)
else:
assert isinstance(
proto, onnx.ModelProto
), f"Unexpected type {type(proto)} for proto"
name = self.dump_onnx(name, proto)
if verbose:
print(f"[{vname}] file size {os.stat(name).st_size // 2**10:1.3f} kb")
if verbose:
print(f"[{vname}] make feeds {string_type(inputs, **kws)}")
if use_ort:
assert isinstance(
proto, onnx.ModelProto
), f"Unexpected type {type(proto)} for proto"
feeds = make_feeds(proto, inputs, use_numpy=True, copy=True)
if verbose:
print(f"[{vname}] feeds {string_type(feeds, **kws)}")
import onnxruntime

if verbose:
print(f"[{vname}] create onnxruntime.InferenceSession")
options = onnxruntime.SessionOptions()
if ort_optimized_graph:
options.optimized_model_filepath = f"{name}.optort.onnx"
sess = onnxruntime.InferenceSession(
proto.SerializeToString(), providers=["CPUExecutionProvider"]
proto.SerializeToString(),
options,
providers=kwargs.get("providers", ["CPUExecutionProvider"]),
)
if verbose:
print(f"[{vname}] run ort feeds {string_type(feeds, **kws)}")
got = sess.run(None, feeds)
else:
feeds = make_feeds(proto, inputs, copy=True)
if verbose:
print(f"[{vname}] feeds {string_type(feeds, **kws)}")
print(f"[{vname}] create InferenceSessionForTorch")
sess = InferenceSessionForTorch(proto, **kwargs)
if verbose:
print(f"[{vname}] run orttorch feeds {string_type(feeds, **kws)}")
got = sess.run(None, feeds)
if verbose:
print(f"[{vname}] compute expected values")