better report

xadupre · xadupre · commit 62db406e692a · 2025-06-11T14:56:55.000+02:00
diff --git a/_doc/api/reference/index.rst b/_doc/api/reference/index.rst
@@ -31,7 +31,7 @@ OnnxruntimeEvaluator
     :members:
 
 ReportResultsComparison
-++++++++++++++++++
++++++++++++++++++++++++
 
 .. autoclass:: onnx_diagnostic.reference.ReportResultsComparison
     :members:
diff --git a/_unittests/ut_reference/test_onnxruntime_evaluator.py b/_unittests/ut_reference/test_onnxruntime_evaluator.py
@@ -4,16 +4,23 @@
 import onnx.helper as oh
 import torch
 import onnxruntime
-from onnx_diagnostic.ext_test_case import ExtTestCase
+from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout
 from onnx_diagnostic.helpers.onnx_helper import from_array_extended
-from onnx_diagnostic.reference import OnnxruntimeEvaluator, ExtendedReferenceEvaluator
+from onnx_diagnostic.reference import (
+    OnnxruntimeEvaluator,
+    ExtendedReferenceEvaluator,
+    ReportResultsComparison,
+)
 
 try:
     from experimental_experiment.torch_interpreter import to_onnx, ExportOptions
 except ImportError:
     to_onnx = None
 
 
+TFLOAT = onnx.TensorProto.FLOAT
+
+
 class TestOnnxruntimeEvaluator(ExtTestCase):
     def test_ort_eval_scan_cdist_add(self):
 
@@ -190,6 +197,35 @@ def test_ort_eval_loop(self):
         got = ref.run(None, feeds)
         self.assertEqualArray(expected, got[0])
 
+    @hide_stdout()
+    def test_report_results_comparison_ort(self):
+        # Chain Cos -> Sin -> Exp -> Log -> Erf, reference tensors are computed with torch.
+        nodes = [
+            oh.make_node("Cos", ["X"], ["nx"]),
+            oh.make_node("Sin", ["nx"], ["t"]),
+            oh.make_node("Exp", ["t"], ["u"]),
+            oh.make_node("Log", ["u"], ["uZ"]),
+            oh.make_node("Erf", ["uZ"], ["Z"]),
+        ]
+        graph = oh.make_graph(
+            nodes,
+            "dummy",
+            [oh.make_tensor_value_info("X", TFLOAT, ["a", "b"])],
+            [oh.make_tensor_value_info("Z", TFLOAT, ["a", "b"])],
+        )
+        model = oh.make_model(graph, ir_version=9, opset_imports=[oh.make_opsetid("", 18)])
+        x = torch.rand(5, 6, dtype=torch.float32)
+        onnx.checker.check_model(model)
+        references = dict(r_x=x, r_cos=x.cos(), r_exp=x.cos().sin().exp())
+        cmp = ReportResultsComparison(references)
+        cmp.clear()
+        feeds = {i.name: t for i, t in zip(model.graph.input, (x,))}
+        rt = OnnxruntimeEvaluator(model, verbose=10)
+        rt.run(None, feeds, report_cmp=cmp)
+        abs_err = {key: diff["abs"] for key, diff in cmp.value.items()}
+        self.assertLess(abs_err[(0, "nx"), "r_cos"], 1e-6)
+        self.assertLess(abs_err[(2, "u"), "r_exp"], 1e-6)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_reference/test_torch_onnx_evaluator.py b/_unittests/ut_reference/test_torch_onnx_evaluator.py
@@ -1,5 +1,6 @@
 import unittest
 import numpy as np
+import pandas
 import onnx
 import onnx.helper as oh
 import onnx.numpy_helper as onh
@@ -1501,8 +1502,14 @@ def test_report_results_comparison(self):
         rt = TorchOnnxEvaluator(model, verbose=10)
         rt.run(None, feeds, report_cmp=cmp)
         d = {k: d["abs"] for k, d in cmp.value.items()}
-        self.assertEqual(d["nx", "r_cos"], 0)
-        self.assertEqual(d["u", "r_exp"], 0)
+        self.assertEqual(d[(0, "nx"), "r_cos"], 0)
+        self.assertEqual(d[(2, "u"), "r_exp"], 0)
+        data = cmp.data
+        self.assertIsInstance(data, list)
+        df = pandas.DataFrame(data)
+        piv = df.pivot(index=("run_index", "run_name"), columns="ref_name", values="abs")
+        self.assertEqual(list(piv.columns), ["r_cos", "r_exp", "r_x"])
+        self.assertEqual(list(piv.index), [(0, "nx"), (1, "t"), (2, "u"), (3, "uZ"), (4, "Z")])
 
 
 if __name__ == "__main__":
diff --git a/onnx_diagnostic/reference/ort_evaluator.py b/onnx_diagnostic/reference/ort_evaluator.py
@@ -22,8 +22,10 @@
     InferenceSessionForNumpy,
     _InferenceSession,
 )
+from .report_results_comparison import ReportResultsComparison
 from .evaluator import ExtendedReferenceEvaluator
 
+
 PROTO = (FunctionProto, ModelProto, GraphProto, NodeProto)
 Proto = Union[FunctionProto, ModelProto, GraphProto, NodeProto]
 
@@ -214,16 +216,21 @@ def run(
         outputs: Optional[List[str]],
         feed_inputs: Dict[str, Any],
         intermediate: bool = False,
+        report_cmp: Optional[ReportResultsComparison] = None,
     ) -> Union[Dict[str, Any], List[Any]]:
         """
         Runs the model.
         It only works with numpy arrays.
 
         :param outputs: required outputs or None for all
         :param feed_inputs: inputs
         :param intermediate: returns all output instead of the last ones
+        :param report_cmp: used as a reference,
+            every intermediate result is compared to every existing one,
+            if not empty, it is an instance of
+            :class:`onnx_diagnostic.reference.ReportResultsComparison`
         :return: outputs, as a list if return_all is False,
             as a dictionary if return_all is True
         """
         if self.rt_nodes_ is None:
             # runs a whole
@@ -267,6 +274,10 @@ def run(
                 self._log(2, " + %s: %s", name, value)  # type: ignore[arg-type]
                 assert isinstance(name, str), f"unexpected type for name {type(name)}"
                 results[name] = value
+            if report_cmp:
+                reported = report_cmp.report(dict(zip(node.output, outputs)))
+                if self.verbose > 1:
+                    print(f"  -- report {len(reported)} comparisons")
             if not intermediate:
                 self._clean_unused_inplace(i_node, node, results)
 
diff --git a/onnx_diagnostic/reference/report_results_comparison.py b/onnx_diagnostic/reference/report_results_comparison.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 
 ReportKeyNameType = Union[str, Tuple[str, int, str]]
@@ -9,7 +9,7 @@ class ReportResultsComparison:
     """
     Holds tensors a runtime can use as a reference to compare
     intermediate results.
-    See :meth:`onnx_diagnostic.reference.TorchOnnxEvalutor.run`.
+    See :meth:`onnx_diagnostic.reference.TorchOnnxEvaluator.run`.
 
     :param tensors: tensor
     """
@@ -40,23 +40,39 @@ def _build_mapping(self):
     def clear(self):
-        """Clears the last report."""
+        """Clears the last report and the run names already seen."""
         self.report_cmp = {}
+        self.unique_run_names = set()
 
     @property
-    def value(self) -> Dict[Tuple[str, ReportKeyNameType], Dict[str, Union[float, str]]]:
-        "Returns the report."
+    def value(
+        self,
+    ) -> Dict[Tuple[Tuple[int, str], ReportKeyNameType], Dict[str, Union[float, str]]]:
+        "Returns the report, keys are ``((run_index, run_name), ref_name)``."
         return self.report_cmp
 
+    @property
+    def data(self) -> List[Dict[str, Any]]:
+        "Returns the report as a list of flat rows a dataframe can consume."
+        rows = []
+        for k, v in self.value.items():
+            (i_run, run_name), ref_name = k
+            d = dict(run_index=i_run, run_name=run_name, ref_name=ref_name)
+            d.update(v)
+            rows.append(d)
+        return rows
+
     def report(
         self, outputs: Dict[str, "torch.Tensor"]  # noqa: F821
-    ) -> List[Tuple[str, ReportKeyNameType, Dict[str, Union[float, str]]]]:
+    ) -> List[Tuple[Tuple[int, str], ReportKeyNameType, Dict[str, Union[float, str]]]]:
         """
         For every tensor in outputs, compares it to every tensor held by
         this class if it shares the same type and shape. The function returns
         the results of the comparison. The function also collects the results
         into a dictionary the user can retrieve later.
         """
-        res: List[Tuple[str, ReportKeyNameType, Dict[str, Union[float, str]]]] = []
+        res: List[Tuple[Tuple[int, str], ReportKeyNameType, Dict[str, Union[float, str]]]] = []
         for name, tensor in outputs.items():
+            i_run = len(self.unique_run_names)
+            self.unique_run_names.add(name)
             key = self.key(tensor)
             if key not in self.mapping:
                 continue
@@ -71,6 +87,7 @@ def report(
                     diff = self.max_diff(t, t2)
                 else:
                     diff = self.max_diff(tensor, t2)
-                res.append((name, held_key, diff))  # type: ignore[arg-type]
-                self.report_cmp[name, held_key] = diff
+                # rows use the same (run_index, run_name) key as report_cmp and the annotation
+                res.append(((i_run, name), held_key, diff))  # type: ignore[arg-type]
+                self.report_cmp[(i_run, name), held_key] = diff
         return res