Skip to content

Commit 9c9bf00

Browse files
authored
Adds discrepancies with the exporter program (#307)
* Adds discrepancies with the exporter program
* better
1 parent 0521c46 commit 9c9bf00

File tree

6 files changed

+114
-19
lines changed

6 files changed

+114
-19
lines changed

_unittests/ut_tasks/try_export.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,15 @@ def _config_reduction(config, task):
115115
verbose=1,
116116
stop_if_static=2,
117117
):
118+
if exporter == "onnx-dynamo":
119+
# The exported program in ONNXProgram cannot be restored.
120+
ep2 = torch.export.export(
121+
model.visual,
122+
(),
123+
kwargs=export_inputs,
124+
dynamic_shapes=self.use_dyn_not_str(dynamic_shapes),
125+
)
126+
torch.export.save(ep2, f"{fileep}.backup.pt2")
118127
to_onnx(
119128
model.visual,
120129
kwargs=export_inputs,
@@ -127,6 +136,14 @@ def _config_reduction(config, task):
127136
optimize=True,
128137
)
129138

139+
pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
140+
pt2_file = [f for f in pt2_files if os.path.exists(f)]
141+
assert pt2_file, f"Unable to find an existing file among {pt2_files}"
142+
pt2_file = pt2_file[0]
143+
# self.assertExists(pt2_file)
144+
# ep = torch.export.load(pt2_file)
145+
# diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
146+
# print("----------- diff", diff)
130147
self.assert_onnx_disc(
131148
f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}",
132149
filename,
@@ -142,6 +159,7 @@ def _config_reduction(config, task):
142159
atol=0.02,
143160
rtol=10,
144161
ort_optimized_graph=False,
162+
ep=pt2_file,
145163
)
146164

147165

onnx_diagnostic/export/api.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ def to_onnx(
112112
ort_fusions.optimize_for_ort(epo.model)
113113
if filename:
114114
epo.save(filename, external_data=True)
115+
if save_ep:
116+
if isinstance(save_ep, tuple):
117+
save_ep = save_ep[0]
118+
torch.export.save(epo.exported_program, f"{save_ep}.pt2")
115119
return epo
116120

117121
if exporter == "modelbuilder":

onnx_diagnostic/ext_test_case.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1199,6 +1199,7 @@ def assert_onnx_disc(
11991199
expected: Optional[Any] = None,
12001200
use_ort: bool = False,
12011201
ort_optimized_graph: bool = False,
1202+
ep: Optional[Union["torch.export.ExportedProgram", str]] = None, # noqa: F821
12021203
**kwargs,
12031204
):
12041205
"""
@@ -1218,6 +1219,7 @@ def assert_onnx_disc(
12181219
:param copy_inputs: to copy the inputs
12191220
:param use_ort: use :class:`onnxruntime.InferenceSession`
12201221
:param ort_optimized_graph: dumps the optimized onnxruntime graph
1222+
:param ep: exported program (or saved exported program)
12211223
:param kwargs: arguments sent to
12221224
:class:`onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch`
12231225
"""
@@ -1245,6 +1247,7 @@ def assert_onnx_disc(
12451247
print(f"[{vname}] file size {os.stat(name).st_size // 2**10:1.3f} kb")
12461248
if verbose:
12471249
print(f"[{vname}] make feeds {string_type(inputs, **kws)}")
1250+
12481251
if use_ort:
12491252
assert isinstance(
12501253
proto, onnx.ModelProto
@@ -1275,6 +1278,7 @@ def assert_onnx_disc(
12751278
got = sess.run(None, feeds)
12761279
if verbose:
12771280
print(f"[{vname}] compute expected values")
1281+
12781282
if expected is None:
12791283
if copy_inputs:
12801284
expected = (
@@ -1284,9 +1288,45 @@ def assert_onnx_disc(
12841288
)
12851289
else:
12861290
expected = model(*inputs) if isinstance(inputs, tuple) else model(**inputs)
1291+
12871292
if verbose:
12881293
print(f"[{vname}] expected {string_type(expected, **kws)}")
12891294
print(f"[{vname}] obtained {string_type(got, **kws)}")
1295+
1296+
if ep:
1297+
if isinstance(ep, str):
1298+
if verbose:
1299+
print(f"[{vname}] load exported program {ep!r}")
1300+
import torch
1301+
1302+
ep = torch.export.load(ep)
1303+
ep_inputs = copy.deepcopy(inputs) if copy_inputs else inputs
1304+
ep_model = ep.module() # type: ignore[union-attr]
1305+
ep_expected = (
1306+
ep_model(*copy.deepcopy(ep_inputs))
1307+
if isinstance(ep_inputs, tuple)
1308+
else ep_model(**copy.deepcopy(ep_inputs))
1309+
)
1310+
if verbose:
1311+
print(f"[{vname}] ep_expected {string_type(ep_expected, **kws)}")
1312+
ep_diff = max_diff(expected, ep_expected)
1313+
if verbose:
1314+
print(f"[{vname}] ep_diff {string_diff(ep_diff)}")
1315+
assert (
1316+
isinstance(ep_diff["abs"], float)
1317+
and isinstance(ep_diff["rel"], float)
1318+
and not numpy.isnan(ep_diff["abs"])
1319+
and ep_diff["abs"] <= atol
1320+
and not numpy.isnan(ep_diff["rel"])
1321+
and ep_diff["rel"] <= rtol
1322+
), (
1323+
f"discrepancies in {test_name!r} between the model "
1324+
f"and the exported model diff={string_diff(ep_diff)}"
1325+
)
1326+
ep_nx_diff = max_diff(ep_expected, got, flatten=True)
1327+
if verbose:
1328+
print(f"[{vname}] ep_nx_diff {string_diff(ep_nx_diff)}")
1329+
12901330
diff = max_diff(expected, got, flatten=True)
12911331
if verbose:
12921332
print(f"[{vname}] diff {string_diff(diff)}")
@@ -1297,7 +1337,10 @@ def assert_onnx_disc(
12971337
and diff["abs"] <= atol
12981338
and not numpy.isnan(diff["rel"])
12991339
and diff["rel"] <= rtol
1300-
), f"discrepancies in {test_name!r}, diff={string_diff(diff)}"
1340+
), (
1341+
f"discrepancies in {test_name!r} between the model and "
1342+
f"the onnx model diff={string_diff(diff)}"
1343+
)
13011344

13021345
def _debug(self):
13031346
"Tells if DEBUG=1 is set up."
@@ -1308,6 +1351,16 @@ def string_type(self, *args, **kwargs):
13081351

13091352
return string_type(*args, **kwargs)
13101353

1354+
def max_diff(self, *args, **kwargs):
1355+
from .helpers import max_diff
1356+
1357+
return max_diff(*args, **kwargs)
1358+
1359+
def use_dyn_not_str(self, *args, **kwargs):
1360+
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
1361+
1362+
return use_dyn_not_str(*args, *kwargs)
1363+
13111364
def subloop(self, *args, verbose: int = 0):
13121365
"Loops over elements and calls :meth:`unittests.TestCase.subTest`."
13131366
if len(args) == 1:

onnx_diagnostic/helpers/ort_session.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,13 @@ def __init__(
134134

135135
self.sess = sess
136136
self.input_names = [i.name for i in sess.get_inputs()]
137+
assert (
138+
"" not in self.input_names
139+
), f"Input name cannot be empty but input_names={self.input_names}"
137140
self.output_names = [i.name for i in sess.get_outputs()]
141+
assert (
142+
"" not in self.input_names
143+
), f"Output name cannot be empty but output_names={self.output_names}"
138144
self.input_shapes = [i.shape for i in sess.get_inputs()]
139145
self.output_shapes = [i.shape for i in sess.get_outputs()]
140146
self.input_types = [i.type for i in sess.get_inputs()]
@@ -497,6 +503,7 @@ def run_dlpack(
497503
values = ORTC.OrtValueVector()
498504
device = -1
499505
for k, v in feeds.items():
506+
assert k != "", f"Input cannot be empty but feeds names={list(feeds)}"
500507
device = max(device, v.get_device())
501508
assert hasattr(v, "__dlpack__"), f"class {type(v)} should be serialized"
502509
if not v.is_contiguous():

onnx_diagnostic/reference/ort_evaluator.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -564,18 +564,14 @@ def _run(self, node: NodeProto, inputs: List[Any], results: Dict[str, Any]) -> L
564564
onx, sess = self._get_sess(node, inputs)
565565
self._cache[key] = onx, sess
566566

567-
feeds = dict(zip(node.input, inputs))
568-
if "" in feeds:
569-
cls = None
570-
for k, v in feeds.items():
571-
if k != "":
572-
cls = v.__class__
573-
break
574-
assert (
575-
cls is not None
576-
), f"Unable to get input class (array or tensor), feeds={string_type(feeds)}"
577-
feeds[""] = cls([0])
578-
567+
feeds = {}
568+
for i, val in zip(node.input, inputs):
569+
if i == "":
570+
assert (
571+
val is None
572+
), f"input name={i!r} but val={string_type(val, with_shape=True)}"
573+
continue
574+
feeds[i] = val
579575
assert hasattr(sess, "run"), f"Missing method run for type {type(sess)}"
580576
outputs = list(sess.run(None, feeds))
581577
assert isinstance(outputs, list), f"Unexpected type for outputs {type(outputs)}"

onnx_diagnostic/torch_onnx/sbs.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -567,10 +567,11 @@ def _loop_cmp(
567567
print(f"[run_aligned-nx] +inp: {inp.name}: {string_type(v, **str_kws)}")
568568

569569
placeholders = {node.name for node in ep.graph.nodes if node.op == "placeholder"}
570-
ep_state_dict = {**ep.state_dict, **dict(ep.named_buffers())}
570+
ep_state_dict = {**ep.state_dict, **dict(ep.named_buffers(), **ep.tensor_constants)}
571571
placeholders_to_state_dict = {
572572
**{f"p_{name.replace('.', '_')}": name for name in ep.state_dict},
573573
**{f"b_{name.replace('.', '_')}": name for name, _ in ep.named_buffers()},
574+
**{f"c_{name.replace('.', '_')}": name for name in ep.tensor_constants},
574575
}
575576
for n in onnx_results:
576577
if n not in placeholders:
@@ -588,6 +589,7 @@ def _loop_cmp(
588589
else:
589590
loop = list(enumerate(ep_graph_nodes))
590591

592+
already_run = set()
591593
ep_durations = {}
592594
yielded_nodes = 0
593595
max_abs = 0
@@ -641,8 +643,8 @@ def _loop_cmp(
641643
yield record
642644
else:
643645
assert node.name in placeholders_to_state_dict, (
644-
f"Unable to find placeholder {node.name!r} in "
645-
f"{sorted(placeholders_to_state_dict)}"
646+
f"Unable to find placeholder {node.name!r} (node.op={node.op!r}), "
647+
f"existing: {sorted(placeholders_to_state_dict)}"
646648
)
647649
torch_results[node.name] = ep_state_dict[placeholders_to_state_dict[node.name]]
648650
if verbose > 1:
@@ -683,6 +685,8 @@ def _loop_cmp(
683685
continue
684686

685687
for i_onnx in range(last_position, max_pos + 1):
688+
if i_onnx in already_run:
689+
continue
686690
node = onx.graph.node[i_onnx]
687691
if verbose > 1:
688692
print(
@@ -695,9 +699,16 @@ def _loop_cmp(
695699
f"mapped {yielded_nodes} maxabs {max_abs:1.5f}"
696700
)
697701
ref = run_cls(node, **run_cls_kwargs)
698-
feeds = {k: onnx_results[k] for k in node.input}
702+
feeds = {k: onnx_results[k] for k in node.input if k}
703+
assert "" not in feeds, f"Unexpected feeds={string_type(feeds, **str_kws)}"
699704
begin = time.perf_counter()
700-
res = ref.run(None, feeds) # type: ignore[attr-defined]
705+
try:
706+
res = ref.run(None, feeds) # type: ignore[attr-defined]
707+
except Exception as e:
708+
raise RuntimeError(
709+
f"Unable to run node {node.op_type}, domain={node.domain} "
710+
f"with inputs={node.input}, feeds={string_type(feeds, **str_kws)}"
711+
) from e
701712
duration = time.perf_counter() - begin
702713
assert (
703714
not has_cuda
@@ -748,6 +759,7 @@ def _loop_cmp(
748759
if tmp.err_abs is not None:
749760
max_abs = max(max_abs, tmp.err_abs)
750761
yield tmp
762+
already_run.add(i_onnx)
751763

752764
last_position = max_pos + 1
753765

@@ -758,14 +770,17 @@ def _loop_cmp(
758770
f"to {len(onx.graph.node)}"
759771
)
760772
for i_onnx in range(last_position, len(onx.graph.node)):
773+
if i_onnx in already_run:
774+
continue
761775
node = onx.graph.node[i_onnx]
762776
if verbose > 1:
763777
print(
764778
f"[run_aligned] run onx.graph.node[{i_onnx}]: "
765779
f"{node.op_type}({', '.join(node.input)}) -> {', '.join(node.output)}"
766780
)
767781
ref = run_cls(node, **run_cls_kwargs)
768-
feeds = {k: onnx_results[k] for k in node.input}
782+
feeds = {k: onnx_results[k] for k in node.input if k}
783+
assert "" not in feeds, f"Unexpected feeds={string_type(feeds, **str_kws)}"
769784
begin = time.perf_counter()
770785
res = ref.run(None, feeds) # type: ignore[attr-defined]
771786
duration = time.perf_counter() - begin
@@ -800,6 +815,8 @@ def _loop_cmp(
800815
if tmp.err_abs is not None:
801816
max_abs = max(max_abs, tmp.err_abs)
802817
yield tmp
818+
already_run.add(i_onnx)
819+
803820
if verbose:
804821
print(f"[run_aligned] done with {yielded_nodes} mapped nodes")
805822
print(f"[run_aligned] max absolution error={max_abs}")

0 commit comments

Comments (0)