Commit 5842d50

xadupre and sdpython authored
Handles parameters in OnnxruntimeEvaluator for FunctionProto (#324)
* update ci
* freeze
* patch
* better with cache
* shorten names
* few changes
* ut
* Handles parameters in OnnxruntimeEvaluator for FunctionProto
* final

---------

Co-authored-by: xavier dupré <[email protected]>
1 parent 173f6a4 commit 5842d50

8 files changed: +194 −42 lines changed

.github/workflows/models.yml

Lines changed: 1 addition & 1 deletion
@@ -61,5 +61,5 @@ jobs:
         run: python -m pip freeze

       - name: qwen2.5_vl_instruct
-        run: PYTHONPATH=. UNITTEST_GOING=1 NEVERTEST=1 QWEN25ATTENTION=BIGMASK TESTDTYPE=float16 TESTDEVICE=cpu python _unittests/ut_tasks/try_export.py -f -k test_qwen_2_5_vli_visual
+        run: PYTHONPATH=. UNITTEST_GOING=1 NEVERTEST=1 QWEN25ATTENTION=BIGMASK TESTDTYPE=float16 TESTDEVICE=cpu python _unittests/ut_tasks/try_export.py -f -k test_qwen25_vli_visual

CHANGELOGS.rst

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ Change Logs
 0.8.3
 +++++

+* :pr:`324`: supports FunctionProto with arguments in OnnxruntimeEvaluator
 * :pr:`323`: drops torch 2.8 on CI
 * :pr:`322`: support rerunning onnx kernels with torch intermediate results in side-by-side
 * :pr:`314`: fix modelbuilder download needed after this change https://github.com/microsoft/onnxruntime-genai/pull/1862
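Before the file-by-file diff, a distilled view of the headline change may help. The sketch below assumes only what the diffs further down confirm (the `whole=True` and `function_kwargs` arguments of the OnnxruntimeEvaluator constructor); the domain, the `ScaleBy` function, and the `alpha` parameter are invented for illustration:

import numpy as np
import onnx
import onnx.helper as oh
from onnx_diagnostic.reference import OnnxruntimeEvaluator

# Hypothetical one-parameter function: y = x * alpha.
fn = oh.make_function(
    "demo_domain",                                 # invented domain
    "ScaleBy",                                     # invented name
    ["x"],
    ["y"],
    [
        oh.make_node("Constant", [], ["alpha"]),   # value comes from the parameter
        oh.make_node("Mul", ["x", "alpha"], ["y"]),
    ],
    [oh.make_opsetid("", 14)],
    ["alpha"],                                     # declared function parameter
)
# Tie the Constant's value_float to the function parameter "alpha".
att = onnx.AttributeProto()
att.name = "value_float"
att.ref_attr_name = "alpha"
att.type = onnx.AttributeProto.FLOAT
fn.node[0].attribute.append(att)

# New in this commit: the parameter value is supplied via function_kwargs.
sess = OnnxruntimeEvaluator(fn, whole=True, function_kwargs=dict(alpha=2.0))
x = np.arange(4).astype(np.float32)
got = sess.run(None, dict(x=x))
assert np.allclose(got[0], x * 2.0)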

_unittests/ut_reference/test_onnxruntime_evaluator.py

Lines changed: 35 additions & 0 deletions
@@ -284,6 +284,41 @@ def test_skip_simplified_layer_normalization(self):
         self.assertEqual(got[1].shape, feeds["x"].shape)
         self.assertEqual(got[1].dtype, feeds["x"].dtype)

+    def test_function_proto_with_kwargs(self):
+        linear_function = oh.make_function(
+            "test_domain",
+            "LinearRegression",
+            ["x", "a", "b"],
+            ["y"],
+            [
+                oh.make_node("Constant", [], ["eps"]),
+                oh.make_node("Constant", [], ["zero"], value_ints=[0]),
+                oh.make_node("Unsqueeze", ["eps", "zero"], ["eps1d"]),
+                oh.make_node("MatMul", ["x", "a"], ["xa"]),
+                oh.make_node("Add", ["b", "eps1d"], ["beps"]),
+                oh.make_node("Add", ["xa", "beps"], ["y"]),
+            ],
+            [oh.make_opsetid("", 14)],
+            ["epsilon"],
+        )
+        att = onnx.AttributeProto()
+        att.name = "value_float"
+        att.ref_attr_name = "epsilon"
+        att.type = onnx.AttributeProto.FLOAT
+        linear_function.node[0].attribute.append(att)
+        feeds = dict(
+            x=np.random.rand(4, 4).astype(np.float32),
+            a=np.random.rand(4, 2).astype(np.float32),
+            b=np.random.rand(1, 2).astype(np.float32),
+        )
+        epsilon = 15.6
+        expected = feeds["x"] @ feeds["a"] + feeds["b"] + epsilon
+        sess = OnnxruntimeEvaluator(
+            linear_function, whole=True, function_kwargs=dict(epsilon=epsilon)
+        )
+        got = sess.run(None, feeds)
+        self.assertEqualArray(expected, got[0], atol=1e-5)
+

 if __name__ == "__main__":
     unittest.main(verbosity=2)
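The mechanism this test exercises: inside a FunctionProto, a node attribute can carry a `ref_attr_name` instead of a concrete value, deferring to a function-level attribute (here, `value_float` of the first `Constant` defers to the function parameter `epsilon`). Such a function is not executable until every reference is bound, which is what `function_kwargs` now supplies. A hypothetical helper sketching that binding step, for illustration only and not the library's actual implementation:

import onnx

def bind_function_attributes(fn: onnx.FunctionProto, **values) -> onnx.FunctionProto:
    """Return a copy of ``fn`` with every ref_attr_name resolved to a value."""
    bound = onnx.FunctionProto()
    bound.CopyFrom(fn)
    for node in bound.node:
        for att in node.attribute:
            if att.ref_attr_name and att.ref_attr_name in values:
                value = values[att.ref_attr_name]
                att.ClearField("ref_attr_name")
                if att.type == onnx.AttributeProto.FLOAT:
                    att.f = float(value)
                elif att.type == onnx.AttributeProto.INT:
                    att.i = int(value)
                else:
                    raise NotImplementedError(f"unhandled attribute type {att.type}")
    del bound.attribute[:]  # the parameters are now baked in
    return bound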

_unittests/ut_tasks/try_export.py

Lines changed: 26 additions & 24 deletions
@@ -185,24 +185,25 @@ def _config_reduction(config, task):
             onnx_plugs=PLUGS,
         )

-        with open(
-            self.get_dump_file(
-                f"sbs_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.sh"
-            ),
-            "w",
-        ) as f:
-            f.write(
-                textwrap.dedent(
-                    f"""
-                    clear&&python -m onnx_diagnostic sbs \\
-                    -i qwen25_vli_visual.inputs.pt \\
-                    -e test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.graph.ep.pt2 \\
-                    -m test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.onnx \\
-                    -o test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.xlsx \\
-                    -v 1 --atol 0.1 --rtol 1000
-                    """
+        if not self.unit_test_going():
+            with open(
+                self.get_dump_file(
+                    f"sbs_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.sh"
+                ),
+                "w",
+            ) as f:
+                f.write(
+                    textwrap.dedent(
+                        f"""
+                        clear&&python -m onnx_diagnostic sbs \\
+                        -i qwen25_vli_visual.inputs.pt \\
+                        -e test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.graph.ep.pt2 \\
+                        -m test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.onnx \\
+                        -o test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.xlsx \\
+                        -v 1 --atol 0.1 --rtol 1000
+                        """
+                    )
                 )
-            )
         print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
         model = onnx.load(filename, load_external_data=False)
         if attention == "PACKED":
@@ -226,11 +227,13 @@ def _config_reduction(config, task):
         assert (
             self.unit_test_going() or pt2_files
         ), f"Unable to find an existing file among {pt2_files!r}"
-        pt2_file = (
-            (pt2_files[0] if pt2_files else None)
-            if not self.unit_test_going()
-            else None
-        )
+
+        # pt2_file = (
+        #     (pt2_files[0] if pt2_files else None)
+        #     if not self.unit_test_going()
+        #     else None
+        # )
+
         # self.assertExists(pt2_file)
         # ep = torch.export.load(pt2_file)
         # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
@@ -250,8 +253,7 @@ def _config_reduction(config, task):
             use_ort=True,
             atol=0.02,
             rtol=10,
-            ort_optimized_graph=False,
-            ep=pt2_file,
+            # ep=pt2_file,
             expected=expected,
         )
         print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")

_unittests/ut_torch_export_patches/test_patch_transformers.py

Lines changed: 66 additions & 0 deletions
@@ -6,6 +6,7 @@
 import onnx_diagnostic.torch_export_patches.patches.patch_transformers as patch_transformers
 from onnx_diagnostic.ext_test_case import (
     ExtTestCase,
+    requires_cuda,
     requires_transformers,
     requires_torch,
     ignore_warnings,
@@ -518,6 +519,71 @@ def test_qwen2_5_vl_vision_attention_iteration(self):
         )
         self.clean_dump()

+    @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
+    @requires_cuda()
+    def test_plug_packed_multi_head_attention_qwen25(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
+            qwen_sdpa_attention_versatile,
+        )
+
+        inputs = (
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
+            torch.tensor(
+                [
+                    0,
+                    64,
+                    128,
+                    192,
+                    256,
+                    304,
+                    368,
+                    432,
+                    496,
+                    560,
+                    608,
+                    672,
+                    736,
+                    800,
+                    864,
+                    912,
+                    976,
+                    1040,
+                    1104,
+                    1168,
+                    1216,
+                    1232,
+                    1248,
+                    1264,
+                    1280,
+                    1292,
+                ],
+                dtype=torch.int64,
+            ).to("cuda"),
+        )
+
+        results = qwen_sdpa_attention_versatile.verify(
+            *inputs,
+            scaling=0.5,
+            num_heads=16,
+            dump_onnx_model=self.get_dump_file(
+                "test_plug_packed_multi_head_attention_qwen25.onnx"
+            ),
+        )
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=0.01)
+        self.assertLess(results.diffs[0]["abs"], 0.01)
+
+        results = qwen_sdpa_attention_versatile.verify(
+            *inputs, scaling=0.11180339887498948, num_heads=16
+        )
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=0.01)
+        self.assertLess(results.diffs[0]["abs"], 0.01)
+

 if __name__ == "__main__":
     unittest.main(verbosity=2)
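Two details of this new test are worth spelling out. The constant `0.11180339887498948` is the default SDPA scaling `head_dim ** -0.5` for `head_dim=80`, and the final `int64` input is consistent with a `cu_seqlens`-style vector of cumulative token offsets delimiting the packed sequences, ending at the total length 1292. A quick sanity check of both readings (plain arithmetic, no library assumptions):

head_dim = 80
assert abs(head_dim ** -0.5 - 0.11180339887498948) < 1e-15  # default SDPA scaling

# Offsets from the test; each consecutive pair delimits one packed segment.
cu_seqlens = [0, 64, 128, 192, 256, 304, 368, 432, 496, 560, 608, 672, 736,
              800, 864, 912, 976, 1040, 1104, 1168, 1216, 1232, 1248, 1264, 1280, 1292]
lengths = [b - a for a, b in zip(cu_seqlens, cu_seqlens[1:])]
assert all(n > 0 for n in lengths) and cu_seqlens[-1] == 1292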

onnx_diagnostic/export/onnx_plug.py

Lines changed: 34 additions & 13 deletions
@@ -27,7 +27,7 @@ class VerifyResult:
     """

     eager_outputs: TUPLE_TENSORS
-    onnx_output: TUPLE_TENSORS
+    onnx_outputs: TUPLE_TENSORS
     diffs: Tuple[Dict[str, float], ...]

@@ -238,20 +238,30 @@ def _register(self):
         custom_def.register_kernel(None)(self.eager_fn)
         custom_def._abstract_fn = self.shape_fn

-    def verify(self, *args, engine: Optional[Callable] = None) -> VerifyResult:
+    def verify(
+        self,
+        *args,
+        engine: Optional[Callable] = None,
+        dump_onnx_model: Optional[str] = None,
+        **kwargs,
+    ) -> VerifyResult:
         """
         Verifies that the eager mode is equivalent to the onnx function given
         as a replacements. This function evaluates `eager_fn`, checks that the shapes
         are equivalent to the ones given by `shape_fn`, and finally evaluates the
         onnx translation if the previous did not fail.

         :param args: function inputs
+        :param kwargs: arguments for eager_fn
         :param engine: by default an instance of
             :class:`onnx_diagnostic.reference.OnnxruntimeEvaluator`.
+        :param dump_onnx_model: to dump the onnx model used to verify
+            eager and onnx produce the same results
+        :param kwargs: additional arguments to the function
         :return: outputs of :func:`onnx_diagnostic.helpers.max_diff`
         """
-        expected = self.eager_fn(*args)
-        shapes = self.shape_fn(*args)
+        expected = self.eager_fn(*args, **kwargs)
+        shapes = self.shape_fn(*args, **kwargs)
         if isinstance(expected, torch.Tensor):
             expected = (expected,)
         assert isinstance(shapes, torch.Tensor), (
@@ -279,11 +289,23 @@ def verify(self, *args, engine: Optional[Callable] = None) -> VerifyResult:

         # Now the ONNX execution.
         assert engine is None, f"Not implemented yet with engine={engine!r}"
-        sess = OnnxruntimeEvaluator(self.function_proto)
-        feeds = dict(zip(sess.input_names, args))
+        ags, kws = self._make_args_kwargs(*args, **kwargs)
+        sess = OnnxruntimeEvaluator(
+            self.function_proto,
+            whole=True,
+            dump_onnx_model=dump_onnx_model,
+            function_kwargs=kws,
+        )
+        feeds = dict(zip(sess.input_names, ags))
         got = sess.run(None, feeds)
-        diffs = tuple(max_diff(e, g) for e, g in zip(expected, got))
-        return VerifyResult(eager_outputs=expected, onnx_output=tuple(got), diffs=diffs)  # type: ignore[arg-type]
+        diffs = tuple(max_diff(e, g, hist=[0.1, 0.01]) for e, g in zip(expected, got))
+        return VerifyResult(eager_outputs=expected, onnx_outputs=tuple(got), diffs=diffs)  # type: ignore[arg-type]
+
+    def _make_args_kwargs(self, *args, **kwargs):
+        ags = args[: len(self.args_name)]
+        kws = dict(zip(self.kwargs_name, args[len(self.args_name) :]))
+        kws.update(kwargs)
+        return ags, kws

 def custom_converter(
         self,
@@ -306,9 +328,7 @@ def converter(
                 self.function_proto.name, domain=self.function_proto.domain
             ):
                 g.add_function(self.function_proto)
-            ags = args[: len(self.args_name)]
-            kws = dict(zip(self.kwargs_name, args[len(self.args_name) :]))
-            kws.update(kwargs)
+            ags, kws = self._make_args_kwargs(*args, **kwargs)
             res = g.make_node(
                 self.function_proto.name,
                 ags,
@@ -369,7 +389,8 @@ def onnx_dynamo_converter(self) -> Callable:
         onnx.defs.register_schema(schema)
         op = onnxscript.values.Op(onnx_plug_op, self.function_proto.name, schema)

-        def converter(*cargs):
-            return op(*cargs, n_outputs=self.n_outputs)
+        def converter(*cargs, **ckwargs):
+            ags, kws = self._make_args_kwargs(*cargs, **ckwargs)
+            return op(*ags, n_outputs=self.n_outputs, **kws)

         return onnxscript.values.TracedOnnxFunction(onnx_plug_op, converter)
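The refactoring hinges on the extracted `_make_args_kwargs`, now shared by `verify` and both converters: leading positionals map to the ONNX function inputs (`self.args_name`), trailing positionals are folded into function attributes by declared name (`self.kwargs_name`), and explicit keywords fill in or override the rest. A standalone restatement of that split, with the `args_name`/`kwargs_name` values assumed purely for illustration:

# Assumed declarations for illustration: three tensor inputs, one attribute.
args_name = ["query", "key", "value"]
kwargs_name = ["scaling"]

def make_args_kwargs(*args, **kwargs):
    # Leading positionals become ONNX function inputs ...
    ags = args[: len(args_name)]
    # ... trailing positionals become function attributes, matched by name ...
    kws = dict(zip(kwargs_name, args[len(args_name):]))
    # ... and explicit keywords fill in or override the rest.
    kws.update(kwargs)
    return ags, kws

ags, kws = make_args_kwargs("Q", "K", "V", 0.5)
assert ags == ("Q", "K", "V") and kws == {"scaling": 0.5}
ags, kws = make_args_kwargs("Q", "K", "V", scaling=0.5)
assert kws == {"scaling": 0.5}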

onnx_diagnostic/ext_test_case.py

Lines changed: 1 addition & 1 deletion
@@ -1323,7 +1323,7 @@ def assert_onnx_disc(
             and not numpy.isnan(ep_diff["rel"])
             and ep_diff["rel"] <= rtol
         ), (
-            f"discrepancies in {test_name!r} between the model "
+            f"discrepancies in {test_name!r} between the exported program "
             f"and the exported model diff={string_diff(ep_diff)}"
         )
         ep_nx_diff = max_diff(ep_expected, got, flatten=True, hist=[0.1, 0.01])
