diff --git a/_unittests/ut_export/test_serialization.py b/_unittests/ut_export/test_serialization.py
index 070a76f1..a7201e1a 100644
--- a/_unittests/ut_export/test_serialization.py
+++ b/_unittests/ut_export/test_serialization.py
@@ -88,7 +88,6 @@ def forward(self, cache):
             Model(), [(self._get_cache(),), (self._get_cache(bsize=3, nheads=5),)]
         )
         guessed = md.guess_dynamic_shapes()
-        print("****", guessed)
         DYN = torch.export.Dim.DYNAMIC
         self.assertEqual(
             (([{0: DYN, 1: DYN}, {0: DYN, 1: DYN}, {0: DYN, 1: DYN}, {0: DYN, 1: DYN}],), {}),
diff --git a/_unittests/ut_tasks/try_export.py b/_unittests/ut_tasks/try_export.py
index a08c9e6f..bc1170df 100644
--- a/_unittests/ut_tasks/try_export.py
+++ b/_unittests/ut_tasks/try_export.py
@@ -14,7 +14,9 @@ class TestTryExportHuggingFaceHubModel(ExtTestCase):
     @ignore_warnings(UserWarning)
     def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
         """
-        clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k qwen_2_5
+        clear&&NEVERTEST=1 python _unittests/ut_tasks/try_export.py -k qwen_2_5
+
+        possible prefix: ``TESTDEVICE=cuda TESTDTYPE=float16 EXPORTER=onnx-dynamo``
 
         ::
 
@@ -33,6 +35,15 @@
             return_dict:bool
         )
         """
+        device = os.environ.get("TESTDEVICE", "cpu")
+        dtype = os.environ.get("TESTDTYPE", "float32")
+        torch_dtype = {
+            "float16": torch.float16,
+            "bfloat16": torch.bfloat16,
+            "float32": torch.float32,
+        }[dtype]
+        exporter = os.environ.get("EXPORTER", "custom")
+
         from transformers import AutoModel, AutoProcessor
 
         # model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
@@ -57,28 +68,28 @@ def _config_reduction(config, task):
         )
         model = data["model"]
-        model = model.to("cpu").to(torch.float32)
+        model = model.to(device).to(getattr(torch, dtype))
+        print(f"-- model.dtype={model.dtype}")
         print(f"-- model.device={model.device}")
 
         processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
         print(f"-- processor={type(processor)}")
 
         inputs = dict(
-            hidden_states=torch.rand((1292, 1176), dtype=torch.float32),
-            grid_thw=torch.tensor([[1, 34, 38]], dtype=torch.int64),
+            hidden_states=torch.rand((1292, 1176), dtype=torch_dtype).to(device),
+            grid_thw=torch.tensor([[1, 34, 38]], dtype=torch.int64).to(device),
         )
         print(f"-- inputs: {self.string_type(inputs, with_shape=True)}")
 
         # this is too long
-        # expected = model.visual(**inputs)
-        # print(f"-- expected: {self.string_type(expected, with_shape=True)}")
+        expected = model.visual(**inputs)
+        print(f"-- expected: {self.string_type(expected, with_shape=True)}")
 
-        exporter = "custom"  # "onnx-dynamo"
         filename = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{exporter}.onnx"
+            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.onnx"
         )
         fileep = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{exporter}.graph"
+            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.graph"
         )
 
         dynamic_shapes = dict(
             hidden_states={0: "hidden_width", 1: "hidden_height"},
@@ -103,8 +114,27 @@
             exporter=exporter,
             verbose=1,
             save_ep=fileep,
+            target_opset=22,
+            optimize=True,
         )
+        self.assert_onnx_disc(
+            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}",
+            filename,
+            model.visual,
+            export_inputs,
+            verbose=1,
+            providers=(
+                ["CUDAExecutionProvider", "CPUExecutionProvider"]
+                if device == "cuda"
+                else ["CPUExecutionProvider"]
+            ),
+            use_ort=True,
+            atol=0.02,
+            rtol=10,
+            ort_optimized_graph=False,
+        )
+
 
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
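Note on the try_export.py change above: the test resolves the requested dtype twice, once through an explicit dict (``torch_dtype``) and once through ``getattr(torch, dtype)``. For the three supported names these agree; a standalone sketch, not part of the patch:

    import torch

    # dict lookup and getattr return the same torch dtype objects
    for dtype in ("float16", "bfloat16", "float32"):
        mapped = {
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "float32": torch.float32,
        }[dtype]
        assert mapped is getattr(torch, dtype)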
"__main__": unittest.main(verbosity=2) diff --git a/_unittests/ut_torch_models/test_tiny_llms_bypassed.py b/_unittests/ut_torch_models/test_tiny_llms_bypassed.py index 8c3b2bd5..2b49da86 100644 --- a/_unittests/ut_torch_models/test_tiny_llms_bypassed.py +++ b/_unittests/ut_torch_models/test_tiny_llms_bypassed.py @@ -28,8 +28,8 @@ def test_export_tiny_llm_2_bypassed(self): inputs = modificator(copy.deepcopy(inputs)) def debug(): - print("***", string_type(inputs, with_shape=True)) - print("***", data["dynamic_shapes"]) + print("--", string_type(inputs, with_shape=True)) + print("--", data["dynamic_shapes"]) import torch.export._draft_export _ep, report = torch.export._draft_export.draft_export( diff --git a/_unittests/ut_torch_models/test_tiny_llms_onnx.py b/_unittests/ut_torch_models/test_tiny_llms_onnx.py index fa5b445d..7b3a6793 100644 --- a/_unittests/ut_torch_models/test_tiny_llms_onnx.py +++ b/_unittests/ut_torch_models/test_tiny_llms_onnx.py @@ -110,8 +110,8 @@ def test_bypass_onnx_export_tiny_llm_official_full(self): self.assertEqual( {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs) ) - print("***", self.string_type(inputs, with_shape=True)) - print("---", type(model)) + print("--", self.string_type(inputs, with_shape=True)) + print("--", type(model)) with torch_export_patches( patch_transformers=True, verbose=1, stop_if_static=1 ) as modificator: diff --git a/onnx_diagnostic/export/api.py b/onnx_diagnostic/export/api.py index eb156fb4..f6ba149e 100644 --- a/onnx_diagnostic/export/api.py +++ b/onnx_diagnostic/export/api.py @@ -16,6 +16,7 @@ def to_onnx( exporter: str = "onnx-dynamo", exporter_kwargs: Optional[Dict[str, Any]] = None, save_ep: Optional[str] = None, + optimize: bool = True, use_control_flow_dispatcher: bool = False, ) -> Any: """ @@ -37,6 +38,7 @@ def to_onnx( :param exporter: exporter to use (``onnx-dynamo``, ``modelbuilder``, ``custom``) :param exporter_kwargs: additional parameters sent to the exporter :param save_ep: saves the exported program + :param optimize: optimizes the model :param use_control_flow_dispatcher: use the dispatcher created to supported custom loops (see :func:`onnx_diagnostic.export.control_flow.loop_for`) :return: the output of the selected exporter, usually a structure including @@ -106,7 +108,8 @@ def to_onnx( dynamo=True, **(exporter_kwargs or {}), ) - ort_fusions.optimize_for_ort(epo.model) + if optimize: + ort_fusions.optimize_for_ort(epo.model) if filename: epo.save(filename, external_data=True) return epo diff --git a/onnx_diagnostic/ext_test_case.py b/onnx_diagnostic/ext_test_case.py index 89c7c5a9..efb8e633 100644 --- a/onnx_diagnostic/ext_test_case.py +++ b/onnx_diagnostic/ext_test_case.py @@ -1188,6 +1188,7 @@ def assert_onnx_disc( copy_inputs: bool = True, expected: Optional[Any] = None, use_ort: bool = False, + ort_optimized_graph: bool = False, **kwargs, ): """ @@ -1206,6 +1207,7 @@ def assert_onnx_disc( :param expected: expected values :param copy_inputs: to copy the inputs :param use_ort: use :class:`onnxruntime.InferenceSession` + :param ort_optimized_graph: dumps the optimized onnxruntime graph :param kwargs: arguments sent to :class:`onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch` """ @@ -1216,29 +1218,50 @@ def assert_onnx_disc( kws = dict(with_shape=True, with_min_max=verbose > 1) vname = test_name or "assert_onnx_disc" if test_name: + import onnx + name = f"{test_name}.onnx" if verbose: print(f"[{vname}] save the onnx model into {name!r}") - name = self.dump_onnx(name, proto) + 
diff --git a/onnx_diagnostic/ext_test_case.py b/onnx_diagnostic/ext_test_case.py
index 89c7c5a9..efb8e633 100644
--- a/onnx_diagnostic/ext_test_case.py
+++ b/onnx_diagnostic/ext_test_case.py
@@ -1188,6 +1188,7 @@ def assert_onnx_disc(
         copy_inputs: bool = True,
         expected: Optional[Any] = None,
         use_ort: bool = False,
+        ort_optimized_graph: bool = False,
         **kwargs,
     ):
         """
@@ -1206,6 +1207,7 @@
         :param expected: expected values
         :param copy_inputs: to copy the inputs
         :param use_ort: use :class:`onnxruntime.InferenceSession`
+        :param ort_optimized_graph: dumps the optimized onnxruntime graph
         :param kwargs: arguments sent to
             :class:`onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch`
         """
@@ -1216,29 +1218,50 @@
         kws = dict(with_shape=True, with_min_max=verbose > 1)
         vname = test_name or "assert_onnx_disc"
         if test_name:
+            import onnx
+
             name = f"{test_name}.onnx"
             if verbose:
                 print(f"[{vname}] save the onnx model into {name!r}")
-            name = self.dump_onnx(name, proto)
+            if isinstance(proto, str):
+                name = proto
+                proto = onnx.load(name)
+            else:
+                assert isinstance(
+                    proto, onnx.ModelProto
+                ), f"Unexpected type {type(proto)} for proto"
+                name = self.dump_onnx(name, proto)
             if verbose:
                 print(f"[{vname}] file size {os.stat(name).st_size // 2**10:1.3f} kb")
         if verbose:
             print(f"[{vname}] make feeds {string_type(inputs, **kws)}")
         if use_ort:
+            assert isinstance(
+                proto, onnx.ModelProto
+            ), f"Unexpected type {type(proto)} for proto"
             feeds = make_feeds(proto, inputs, use_numpy=True, copy=True)
-            if verbose:
-                print(f"[{vname}] feeds {string_type(feeds, **kws)}")
             import onnxruntime
 
+            if verbose:
+                print(f"[{vname}] create onnxruntime.InferenceSession")
+            options = onnxruntime.SessionOptions()
+            if ort_optimized_graph:
+                options.optimized_model_filepath = f"{name}.optort.onnx"
             sess = onnxruntime.InferenceSession(
-                proto.SerializeToString(), providers=["CPUExecutionProvider"]
+                proto.SerializeToString(),
+                options,
+                providers=kwargs.get("providers", ["CPUExecutionProvider"]),
             )
+            if verbose:
+                print(f"[{vname}] run ort feeds {string_type(feeds, **kws)}")
             got = sess.run(None, feeds)
         else:
             feeds = make_feeds(proto, inputs, copy=True)
             if verbose:
-                print(f"[{vname}] feeds {string_type(feeds, **kws)}")
+                print(f"[{vname}] create InferenceSessionForTorch")
             sess = InferenceSessionForTorch(proto, **kwargs)
+            if verbose:
+                print(f"[{vname}] run orttorch feeds {string_type(feeds, **kws)}")
             got = sess.run(None, feeds)
         if verbose:
             print(f"[{vname}] compute expected values")
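Note on the ext_test_case.py change above: ``ort_optimized_graph`` only has an effect on the ``use_ort`` path, where it sets onnxruntime's ``optimized_model_filepath`` so the graph optimized by onnxruntime is dumped next to the model as ``f"{name}.optort.onnx"``. A sketch mirroring the call added in try_export.py; the locals (filename, model, export_inputs) are placeholders:

    self.assert_onnx_disc(
        "my_test_visual",      # test_name, prefix for the dumped files
        filename,              # a path is now accepted and loaded with onnx.load
        model.visual,          # torch module producing the expected outputs
        export_inputs,         # inputs shared by torch and onnxruntime
        use_ort=True,          # required for ort_optimized_graph to apply
        providers=["CPUExecutionProvider"],
        ort_optimized_graph=True,
        atol=0.02,
        rtol=10,
    )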