diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0bd23b68..9dca38dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,12 +56,19 @@ jobs: run: | pip install pytest export PYTHONPATH=. - UNITTEST_GOING=1 pytest --durations=10 _unittests --ignore _unittests/ut_reference/test_backend_extended_reference_evaluator.py + UNITTEST_GOING=1 pytest --durations=10 _unittests --ignore _unittests/ut_reference/test_backend_extended_reference_evaluator.py --ignore _unittests/ut_reference/test_backend_onnxruntime_evaluator.py export PYTHONPATH= - - name: run backend tests + - name: run backend tests python run: | pip install pytest export PYTHONPATH=. UNITTEST_GOING=1 pytest --durations=10 _unittests/ut_reference/test_backend_extended_reference_evaluator.py export PYTHONPATH= + + - name: run backend tests onnxruntime + run: | + pip install pytest + export PYTHONPATH=. + UNITTEST_GOING=1 pytest --durations=10 _unittests/ut_reference/test_backend_onnxruntime_evaluator.py --maxfail=15 + export PYTHONPATH= diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index b99a1372..d3dfbaae 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -59,7 +59,7 @@ jobs: pip install pytest pip install pytest-cov export PYTHONPATH=. - UNITTEST_GOING=1 pytest --cov=./onnx_diagnostic/ --cov-report=xml --durations=10 _unittests --ignore _unittests/ut_reference/test_backend_extended_reference_evaluator.py + UNITTEST_GOING=1 pytest --cov=./onnx_diagnostic/ --cov-report=xml --durations=10 _unittests --ignore _unittests/ut_reference/test_backend_extended_reference_evaluator.py --ignore _unittests/ut_reference/test_backend_onnxruntime_evaluator.py export PYTHONPATH= - name: Upload coverage reports to Codecov diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst index 9fa3adb0..42e207cd 100644 --- a/CHANGELOGS.rst +++ b/CHANGELOGS.rst @@ -4,6 +4,7 @@ Change Logs 0.2.0 +++++ +* :pr:`9`: adds ``OnnxruntimeEvaluator`` * :pr:`8`: adds ``ExtendedReferenceEvaluator`` * :pr:`7`: improves function ``investigate_onnxruntime_issue`` diff --git a/README.rst b/README.rst index 5a797b82..bb2bad34 100644 --- a/README.rst +++ b/README.rst @@ -11,10 +11,6 @@ onnx-diagnostic: investigate onnx models .. image:: https://badge.fury.io/py/onnx-diagnostic.svg :target: http://badge.fury.io/py/onnx-diagnostic -.. image:: http://img.shields.io/github/issues/sdpython/onnx-diagnostic.png - :alt: GitHub Issues - :target: https://github.com/sdpython/onnx-diagnostic/issues - .. image:: https://img.shields.io/badge/license-MIT-blue.svg :alt: MIT License :target: https://opensource.org/license/MIT/ diff --git a/_doc/api/reference/index.rst b/_doc/api/reference/index.rst index 0f617e77..1bd03683 100644 --- a/_doc/api/reference/index.rst +++ b/_doc/api/reference/index.rst @@ -13,6 +13,7 @@ onnx_diagnostic.reference evaluator quantized_tensor + ort_evaluator ExtendedReferenceEvaluator ++++++++++++++++++++++++++ @@ -20,6 +21,12 @@ ExtendedReferenceEvaluator .. autoclass:: onnx_diagnostic.reference.ExtendedReferenceEvaluator :members: +OnnxruntimeEvaluator +++++++++++++++++++++ + +.. 
autoclass:: onnx_diagnostic.reference.OnnxruntimeEvaluator
+    :members:
+
 Other functions
 +++++++++++++++
diff --git a/_doc/api/reference/ort_evaluator.rst b/_doc/api/reference/ort_evaluator.rst
new file mode 100644
index 00000000..9b161bc8
--- /dev/null
+++ b/_doc/api/reference/ort_evaluator.rst
@@ -0,0 +1,8 @@
+
+onnx_diagnostic.reference.ort_evaluator
+=======================================
+
+.. automodule:: onnx_diagnostic.reference.ort_evaluator
+    :members:
+    :no-undoc-members:
+    :exclude-members: OnnxruntimeEvaluator
diff --git a/_doc/conf.py b/_doc/conf.py
index e46703f1..f5503109 100644
--- a/_doc/conf.py
+++ b/_doc/conf.py
@@ -104,11 +104,12 @@
     ("py:class", "False"),
     ("py:class", "True"),
     ("py:class", "Argument"),
-    ("py:class", "onnxscript.ir.Tuple"),
-    ("py:class", "pipeline.Pipeline"),
     ("py:class", "default=sklearn.utils.metadata_routing.UNCHANGED"),
     ("py:class", "ModelProto"),
     ("py:class", "Module"),
+    ("py:class", "np.ndarray"),
+    ("py:class", "onnxscript.ir.Tuple"),
+    ("py:class", "pipeline.Pipeline"),
     ("py:class", "torch.fx.passes.operator_support.OperatorSupport"),
     ("py:class", "torch.fx.proxy.TracerBase"),
     ("py:class", "torch.utils._pytree.Context"),
@@ -177,6 +178,7 @@
     "GraphModule": "https://pytorch.org/docs/stable/fx.html#torch.fx.GraphModule",
     "HuggingFace": "https://huggingface.co/docs/hub/en/index",
     "Linux": "https://www.linux.org/",
+    "ml_dtypes": "https://github.com/jax-ml/ml_dtypes",
     "monai": "https://monai.io/",
     "numpy": "https://numpy.org/",
     "onnx": "https://onnx.ai/onnx/",
@@ -186,6 +188,7 @@
     "onnxrt backend": "https://pytorch.org/docs/stable/onnx_dynamo_onnxruntime_backend.html",
     "onnxruntime": "https://onnxruntime.ai/",
     "onnxruntime-training": "https://onnxruntime.ai/docs/get-started/training-on-device.html",
+    "onnxruntime kernels": "https://onnxruntime.ai/docs/reference/operators/OperatorKernels.html",
     "onnx-array-api": "https://sdpython.github.io/doc/onnx-array-api/dev/",
     "onnx-diagnostic": "https://sdpython.github.io/doc/onnx-diagnostic/dev/",
     "onnx-extended": "https://sdpython.github.io/doc/onnx-extended/dev/",
diff --git a/_doc/examples/plot_failing_onnxruntime_evaluator.py b/_doc/examples/plot_failing_onnxruntime_evaluator.py
new file mode 100644
index 00000000..be274919
--- /dev/null
+++ b/_doc/examples/plot_failing_onnxruntime_evaluator.py
@@ -0,0 +1,106 @@
+"""
+.. _l-plot-failing-onnxruntime-evaluator:
+
+Running OnnxruntimeEvaluator on a failing model
+===============================================
+
+Example :ref:`l-plot-failing-reference-evaluator` demonstrated
+how to run a python runtime on a model, but it may be very slow sometimes
+and it could show some discrepancies if the intended provider is not CPU.
+Let's use :class:`OnnxruntimeEvaluator <onnx_diagnostic.reference.OnnxruntimeEvaluator>`.
+It splits the model into nodes and runs them independently until it succeeds
+or fails. This class converts every node into a model based on the types
+discovered during the execution. It relies on :class:`InferenceSessionForTorch
+<onnx_diagnostic.ort_session.InferenceSessionForTorch>` or
+:class:`InferenceSessionForNumpy
+<onnx_diagnostic.ort_session.InferenceSessionForNumpy>`
+for the execution. This example uses torch tensors and
+bfloat16.
+
+A failing model
++++++++++++++++
+
+The issue here is an operator ``Cast`` trying to convert a result
+into a non-existing type.
+"""
+
+import onnx
+import onnx.helper as oh
+import torch
+import onnxruntime
+from onnx_diagnostic.ext_test_case import has_cuda
+from onnx_diagnostic.helpers import from_array_extended
+from onnx_diagnostic.reference import OnnxruntimeEvaluator
+
+TBFLOAT16 = onnx.TensorProto.BFLOAT16
+
+model = oh.make_model(
+    oh.make_graph(
+        [
+            oh.make_node("Mul", ["X", "Y"], ["xy"], name="n0"),
+            oh.make_node("Sigmoid", ["xy"], ["sy"], name="n1"),
+            oh.make_node("Add", ["sy", "one"], ["C"], name="n2"),
+            oh.make_node("Cast", ["C"], ["X999"], to=999, name="failing"),
+            oh.make_node("CastLike", ["X999", "Y"], ["Z"], name="n4"),
+        ],
+        "nd",
+        [
+            oh.make_tensor_value_info("X", TBFLOAT16, ["a", "b", "c"]),
+            oh.make_tensor_value_info("Y", TBFLOAT16, ["a", "b", "c"]),
+        ],
+        [oh.make_tensor_value_info("Z", TBFLOAT16, ["a", "b", "c"])],
+        [from_array_extended(torch.tensor([1], dtype=torch.bfloat16), name="one")],
+    ),
+    opset_imports=[oh.make_opsetid("", 18)],
+    ir_version=9,
+)
+
+# %%
+# We check it is failing.
+
+try:
+    onnxruntime.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
+except onnxruntime.capi.onnxruntime_pybind11_state.Fail as e:
+    print(e)
+
+
+# %%
+# OnnxruntimeEvaluator
+# ++++++++++++++++++++
+#
+# This class mimics :class:`onnx.reference.ReferenceEvaluator` but runs every
+# node with :epkg:`onnxruntime`, which also implements operators outside the standard.
+# `verbose=10` tells the class to print as much as possible,
+# `verbose=0` prints nothing, and intermediate values print more or less information.
+
+ref = OnnxruntimeEvaluator(model, verbose=10)
+feeds = dict(
+    X=torch.rand((3, 4), dtype=torch.bfloat16), Y=torch.rand((3, 4), dtype=torch.bfloat16)
+)
+try:
+    ref.run(None, feeds)
+except Exception as e:
+    print("ERROR", type(e), e)
+
+
+# %%
+# :epkg:`onnxruntime` may not support bfloat16 on CPU.
+# See :epkg:`onnxruntime kernels`.
+
+if has_cuda():
+    ref = OnnxruntimeEvaluator(model, providers="cuda", verbose=10)
+    feeds = dict(
+        X=torch.rand((3, 4), dtype=torch.bfloat16), Y=torch.rand((3, 4), dtype=torch.bfloat16)
+    )
+    try:
+        ref.run(None, feeds)
+    except Exception as e:
+        print("ERROR", type(e), e)
+
+# %%
+# We can see it runs until it reaches `Cast` and stops.
+# The error message is not always obvious to interpret.
+# It gets improved from time to time.
+# This runtime is useful when the execution fails for a numerical reason.
+# It is possible to insert prints in the python code to display
+# more information or to debug if needed.
diff --git a/_doc/index.rst b/_doc/index.rst
index 01b63d2c..e6231eb6 100644
--- a/_doc/index.rst
+++ b/_doc/index.rst
@@ -8,18 +8,10 @@ onnx-diagnostic: investigate onnx models
 .. image:: https://badge.fury.io/py/onnx-diagnostic.svg
     :target: http://badge.fury.io/py/onnx-diagnostic
 
-.. image:: http://img.shields.io/github/issues/sdpython/onnx-diagnostic.png
-    :alt: GitHub Issues
-    :target: https://github.com/sdpython/onnx-diagnostic/issues
-
 .. image:: https://img.shields.io/badge/license-MIT-blue.svg
     :alt: MIT License
     :target: https://opensource.org/license/MIT/
 
-.. image:: https://img.shields.io/github/repo-size/sdpython/onnx-diagnostic
-    :target: https://github.com/sdpython/onnx-diagnostic/
-    :alt: size
-
 ..
image:: https://img.shields.io/badge/code%20style-black-000000.svg :target: https://github.com/psf/black @@ -51,6 +43,7 @@ Source are `sdpython/onnx-diagnostic * :ref:`l-plot-sxport-with-dynamio-shapes-auto` * :ref:`l-plot-tiny-llm-export` * :ref:`l-plot-failing-reference-evaluator` +* :ref:`l-plot-failing-onnxruntime-evaluator` * :ref:`l-plot-failing-model-extract` **Some Usefuls Tools** diff --git a/_unittests/ut_reference/test_array_tensor.py b/_unittests/ut_reference/test_array_tensor.py index c6983427..8c10f124 100644 --- a/_unittests/ut_reference/test_array_tensor.py +++ b/_unittests/ut_reference/test_array_tensor.py @@ -2,9 +2,8 @@ import numpy as np from onnx import TensorProto from onnx.helper import make_graph, make_model, make_node, make_tensor_value_info -from onnx.reference.op_run import to_array_extended from onnx_diagnostic.ext_test_case import ExtTestCase, ignore_warnings -from onnx_diagnostic.helpers import from_array_extended +from onnx_diagnostic.helpers import from_array_extended, to_array_extended from onnx_diagnostic.reference import ExtendedReferenceEvaluator diff --git a/_unittests/ut_reference/test_backend_extended_reference_evaluator.py b/_unittests/ut_reference/test_backend_extended_reference_evaluator.py index 2a4d7768..5410dd9b 100644 --- a/_unittests/ut_reference/test_backend_extended_reference_evaluator.py +++ b/_unittests/ut_reference/test_backend_extended_reference_evaluator.py @@ -43,13 +43,13 @@ def run(self, inputs, **kwargs): class ExtendedReferenceEvaluatorBackend(onnx.backend.base.Backend): @classmethod - def is_opset_supported(cls, model): # pylint: disable=unused-argument - return True, "" + def is_compatible(cls, model) -> bool: + return True @classmethod def supports_device(cls, device: str) -> bool: d = Device(device) - return d.type == DeviceType.CPU # type: ignore[no-any-return] + return d.type == DeviceType.CPU @classmethod def create_inference_session(cls, model): diff --git a/_unittests/ut_reference/test_backend_onnxruntime_evaluator.py b/_unittests/ut_reference/test_backend_onnxruntime_evaluator.py new file mode 100644 index 00000000..0fcef585 --- /dev/null +++ b/_unittests/ut_reference/test_backend_onnxruntime_evaluator.py @@ -0,0 +1,248 @@ +import unittest +import warnings +from typing import Any +import numpy +import onnx.backend.base +import onnx.backend.test +import onnx.shape_inference +import onnx.version_converter +from onnx import ModelProto +from onnx.backend.base import Device, DeviceType +from onnx.defs import onnx_opset_version +from onnx_diagnostic.reference import OnnxruntimeEvaluator + +ORT_OPSET = max(21, onnx_opset_version() - 2) + + +class OnnxruntimeEvaluatorBackendRep(onnx.backend.base.BackendRep): + def __init__(self, session): + self._session = session + + def run(self, inputs, **kwargs): + if isinstance(inputs, numpy.ndarray): + inputs = [inputs] + if isinstance(inputs, list): + if len(inputs) == len(self._session.input_names): + feeds = dict(zip(self._session.input_names, inputs)) + else: + feeds = {} + pos_inputs = 0 + for inp, tshape in zip(self._session.input_names, self._session.input_types): + shape = tuple(d.dim_value for d in tshape.tensor_type.shape.dim) + if shape == inputs[pos_inputs].shape: + feeds[inp] = inputs[pos_inputs] + pos_inputs += 1 + if pos_inputs >= len(inputs): + break + elif isinstance(inputs, dict): + feeds = inputs + else: + raise TypeError(f"Unexpected input type {type(inputs)!r}.") + outs = self._session.run(None, feeds) + return outs + + +class 
OnnxruntimeEvaluatorBackend(onnx.backend.base.Backend): + @classmethod + def is_compatible(cls, model) -> bool: + return all(not (d.domain == "" and d.version > ORT_OPSET) for d in model.opset_import) + + @classmethod + def supports_device(cls, device: str) -> bool: + d = Device(device) + return d.type == DeviceType.CPU + + @classmethod + def create_inference_session(cls, model): + return OnnxruntimeEvaluator(model) + + @classmethod + def prepare( + cls, model: Any, device: str = "CPU", **kwargs: Any + ) -> OnnxruntimeEvaluatorBackendRep: + if isinstance(model, OnnxruntimeEvaluator): + return OnnxruntimeEvaluatorBackendRep(model) + if isinstance(model, (str, bytes, ModelProto)): + inf = cls.create_inference_session(model) + return cls.prepare(inf, device, **kwargs) + raise TypeError(f"Unexpected type {type(model)} for model.") + + @classmethod + def run_model(cls, model, inputs, device=None, **kwargs): + rep = cls.prepare(model, device, **kwargs) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return rep.run(inputs, **kwargs) + + @classmethod + def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs): + raise NotImplementedError("Unable to run the model node by node.") + + +dft_atol = 1e-3 +stft_atol = 1e-4 +ql_atol = 1e-5 +backend_test = onnx.backend.test.BackendTest( + OnnxruntimeEvaluatorBackend, + __name__, + test_kwargs={ + "test_dft": {"atol": dft_atol, "rtol": numpy.inf}, + "test_dft_axis": {"atol": dft_atol, "rtol": numpy.inf}, + "test_dft_axis_opset19": {"atol": dft_atol, "rtol": numpy.inf}, + "test_dft_inverse": {"atol": dft_atol, "rtol": numpy.inf}, + "test_dft_inverse_opset19": {"atol": dft_atol, "rtol": numpy.inf}, + "test_dft_opset19": {"atol": dft_atol, "rtol": numpy.inf}, + "test_stft": {"atol": stft_atol, "rtol": numpy.inf}, + "test_stft_with_window": {"atol": stft_atol, "rtol": numpy.inf}, + "test_qlinearmatmul_2D_int8_float32": {"atol": ql_atol}, + "test_qlinearmatmul_3D_int8_float32": {"atol": ql_atol}, + }, +) + +# rtol=inf does not work +backend_test.exclude("(test_dft|test_stft)") + +# The following tests are too slow with the reference implementation (Conv). +backend_test.exclude( + "(test_bvlc_alexnet" + "|test_densenet121" + "|test_inception_v1" + "|test_inception_v2" + "|test_resnet50" + "|test_shufflenet" + "|test_squeezenet" + "|test_vgg19" + "|test_zfnet512)" +) + +# The following tests cannot pass because they consists in generating random number. +backend_test.exclude("(test_bernoulli|test_PoissonNLLLLoss)") + +# The following tests are not supported. 
+backend_test.exclude( + "(test_gradient" + "|test_if_opt" + "|test_loop16_seq_none" + "|test_range_float_type_positive_delta_expanded" + "|test_range_int32_type_negative_delta_expanded" + "|test_scan_sum)" +) + +if onnx_opset_version() < 21: + backend_test.exclude( + "(test_averagepool_2d_dilations" + "|test_if*" + "|test_loop*" + "|test_scan*" + "|test_sequence_map*" + "|test_cast_FLOAT_to_STRING|" + "test_castlike_FLOAT_to_STRING|test_strnorm|" + "test_center_crop_pad_crop_axes_hwc_expanded|" + "test_lppool_2d_dilations|test_eyelike_without_dtype)" + ) + +# Disable test about float 8 +backend_test.exclude( + "(test_castlike_BFLOAT16*" + "|test_cast_BFLOAT16*" + "|test_cast_no_saturate*" + "|test_cast_FLOAT_to_FLOAT8*" + "|test_cast_FLOAT16_to_FLOAT8*" + "|test_cast_FLOAT8_to_*" + "|test_castlike_BFLOAT16*" + "|test_castlike_no_saturate*" + "|test_castlike_FLOAT_to_FLOAT8*" + "|test_castlike_FLOAT16_to_FLOAT8*" + "|test_castlike_FLOAT8_to_*" + "|test_quantizelinear_e*)" +) + +# Disable test about INT 4 +backend_test.exclude( + "(test_cast_FLOAT_to_INT4" + "|test_cast_FLOAT16_to_INT4" + "|test_cast_INT4_to_" + "|test_castlike_INT4_to_" + "|test_cast_FLOAT_to_UINT4" + "|test_cast_FLOAT16_to_UINT4" + "|test_cast_UINT4_to_" + "|test_castlike_UINT4_to_)" +) + +backend_test.exclude( + "(test_regex_full_match|" + "test_adagrad|" + "test_adam|" + "test_add_uint8|" + "test_ai_onnx_ml_label_encoder_string|" + "test_ai_onnx_ml_label_encoder_tensor_mapping|" + "test_ai_onnx_ml_label_encoder_tensor_value_only_mapping|" + "test_AvgPool|" + "test_BatchNorm|" + "test_bitshift_[a-z]+_uint16|" + "test_center_crop_pad_crop|" + "test_clip_[0-9a-z_]*expanded|" + "test_elu_[0-9a-z_]*expanded|" + "test_equal_string|" + "test_GLU_|" + "test_identity_opt|" + "test_if|" + "test_image|" + "test_leakyrelu|" + "test_((less)|(greater))_equal_bcast|" + "test_((less)|(greater))[a-z_]*expanded|" + "test_Linear|" + "test_loop13|" + "test_momentum|" + "test_nesterov|" + "test_((mul)|(min)|(max)|(div))_u?int((8)|(16))|" + "test_operator|" + "test_optional_|" + "test_pow_types_float32_uint|" + "test_qlinearmatmul|" + "test_prelu|" + "test_PReLU|" + "test_reduce_max_empty|" + "test_resize_downsample_scales|" + "test_scatter_with_axis|" + "test_scatter_without_axis" + "|test_selu" + "|test_sequence" + "|test_shrink_" + "|test_Softsign" + "|test_split_to_sequence" + "|test_string_concat" + "|test_string_split" + "|test_strnorm_model" + "|test_strnormalizer" + "|test_sub_uint8" + "|test_thresholdedrelu" + "|test_top_k_uint64" + ")" +) + +# failing on CI only +backend_test.exclude( + "(_to_STRING|to_BFLOAT16|STRING_to|BFLOAT16_to|" + "test_constant|test_(de)?quantizelinear_u?int4" + "|test_identity_sequence" + ")" +) + + +# import all test cases at global scope to make them visible to python.unittest +globals().update(backend_test.test_cases) + +if __name__ == "__main__": + res = unittest.main(verbosity=2, exit=False) + tests_run = res.result.testsRun + errors = len(res.result.errors) + skipped = len(res.result.skipped) + unexpected_successes = len(res.result.unexpectedSuccesses) + expected_failures = len(res.result.expectedFailures) + print("---------------------------------") + print( + f"tests_run={tests_run} errors={errors} skipped={skipped} " + f"unexpected_successes={unexpected_successes} " + f"expected_failures={expected_failures}" + ) diff --git a/_unittests/ut_reference/test_ort_evaluator.py b/_unittests/ut_reference/test_ort_evaluator.py new file mode 100644 index 00000000..74994f0b --- /dev/null +++ 
b/_unittests/ut_reference/test_ort_evaluator.py @@ -0,0 +1,253 @@ +import unittest +from typing import Any, Dict, Optional, Tuple +import numpy as np +import ml_dtypes +from onnx import ModelProto, TensorProto +from onnx.checker import check_model +import onnx.helper as oh +import onnx.numpy_helper as onh +import torch +from onnx_diagnostic.ext_test_case import ( + ExtTestCase, + hide_stdout, + ignore_warnings, + requires_cuda, +) +from onnx_diagnostic.helpers import ( + from_array_extended, + onnx_dtype_to_torch_dtype, + onnx_dtype_to_np_dtype, +) +from onnx_diagnostic.reference import ExtendedReferenceEvaluator, OnnxruntimeEvaluator + +TFLOAT = TensorProto.FLOAT + + +class TestOnnxruntimeEvaluatoruator(ExtTestCase): + def _range(self, *shape, bias: Optional[float] = None): + n = np.prod(shape) + x = np.arange(n).astype(np.float32) / n + if bias: + x = x + bias + return x.reshape(tuple(shape)).astype(np.float32) + + def _get_model(self) -> ModelProto: + model = oh.make_model( + oh.make_graph( + [ + oh.make_node("Unsqueeze", ["X", "zero"], ["xu1"]), + oh.make_node("Unsqueeze", ["xu1", "un"], ["xu2"]), + oh.make_node("Reshape", ["xu2", "shape1"], ["xm1"]), + oh.make_node("Reshape", ["Y", "shape2"], ["xm2c"]), + oh.make_node("Cast", ["xm2c"], ["xm2"], to=1), + oh.make_node("MatMul", ["xm1", "xm2"], ["xm"]), + oh.make_node("Reshape", ["xm", "shape3"], ["Z"]), + ], + "dummy", + [ + oh.make_tensor_value_info("X", TFLOAT, [32, 128]), + oh.make_tensor_value_info("Y", TFLOAT, [3, 5, 128, 64]), + ], + [oh.make_tensor_value_info("Z", TFLOAT, [3, 5, 32, 64])], + [ + onh.from_array(np.array([0], dtype=np.int64), name="zero"), + onh.from_array(np.array([1], dtype=np.int64), name="un"), + onh.from_array(np.array([1, 32, 128], dtype=np.int64), name="shape1"), + onh.from_array(np.array([15, 128, 64], dtype=np.int64), name="shape2"), + onh.from_array(np.array([3, 5, 32, 64], dtype=np.int64), name="shape3"), + ], + ), + ir_version=9, + opset_imports=[oh.make_opsetid("", 18)], + ) + check_model(model) + return model + + @ignore_warnings(DeprecationWarning) + def test_ort_eval(self): + model = self._get_model() + + feeds = {"X": self._range(32, 128), "Y": self._range(3, 5, 128, 64)} + ref = ExtendedReferenceEvaluator(model, verbose=10) + expected, out, _ = self.capture(lambda: ref.run(None, feeds)[0]) + self.assertIn("Reshape(xm, shape3) -> Z", out) + + ort_eval = OnnxruntimeEvaluator(model, verbose=10, opsets=20) + got, out, _ = self.capture(lambda: ort_eval.run(None, feeds)[0]) + self.assertEqualArray(expected, got, atol=1e-4) + self.assertIn("Reshape(xm, shape3) -> Z", out) + + @ignore_warnings(DeprecationWarning) + @requires_cuda() + @hide_stdout() + def test_ort_eval_cuda(self): + model = self._get_model() + + feeds = {"X": self._range(32, 128), "Y": self._range(3, 5, 128, 64)} + ref = ExtendedReferenceEvaluator(model, verbose=10) + expected = ref.run(None, feeds)[0] + + ort_eval = OnnxruntimeEvaluator(model, verbose=10, opsets=20, providers="cuda") + got = ort_eval.run(None, feeds)[0] + self.assertEqualArray(expected, got, atol=1e-1) + + @ignore_warnings(DeprecationWarning) + @hide_stdout() + def test_ort_eval_node_proto(self): + model = self._get_model() + + feeds = {"X": self._range(32, 128), "zero": np.array([0], dtype=np.int64)} + ref = ExtendedReferenceEvaluator(model.graph.node[0], verbose=10) + expected = ref.run(None, feeds) + + ort_eval = OnnxruntimeEvaluator(model.graph.node[0], verbose=10, opsets=20) + got = ort_eval.run(None, feeds) + self.assertEqualArrayAny(expected, got, atol=1e-4) + 
self.assertIsInstance(expected[0], np.ndarray) + self.assertIsInstance(got[0], np.ndarray) + + @ignore_warnings(DeprecationWarning) + @hide_stdout() + def test_ort_eval_node_proto_torch(self): + model = self._get_model() + + feeds_np = {"X": self._range(32, 128), "zero": np.array([0], dtype=np.int64)} + feeds = {k: torch.from_numpy(v) for k, v in feeds_np.items()} + ref = ExtendedReferenceEvaluator(model.graph.node[0], verbose=10) + expected = ref.run(None, feeds_np) + + ort_eval = OnnxruntimeEvaluator(model.graph.node[0], verbose=10, opsets=20) + got = ort_eval.run(None, feeds) + self.assertIsInstance(got[0], torch.Tensor) + self.assertEqualArray(expected[0], got[0], atol=1e-4) + + @hide_stdout() + def test_local_function(self): + new_domain = "custom" + + linear_regression = oh.make_function( + new_domain, + "LinearRegression", + ["x", "a", "b"], + ["y"], + [ + oh.make_node("MatMul", ["x", "a"], ["xa"]), + oh.make_node("Add", ["xa", "b"], ["y"]), + ], + [oh.make_opsetid("", 14)], + [], + ) + + graph = oh.make_graph( + [ + oh.make_node("LinearRegression", ["X", "A", "B"], ["Y1"], domain=new_domain), + oh.make_node("Abs", ["Y1"], ["Y"]), + ], + "example", + [ + oh.make_tensor_value_info("X", TFLOAT, [None, None]), + oh.make_tensor_value_info("A", TFLOAT, [None, None]), + oh.make_tensor_value_info("B", TFLOAT, [None, None]), + ], + [oh.make_tensor_value_info("Y", TFLOAT, None)], + ) + + onnx_model = oh.make_model( + graph, + opset_imports=[oh.make_opsetid("", 14), oh.make_opsetid(new_domain, 1)], + functions=[linear_regression], + ir_version=10, + ) + feeds = { + "X": np.random.randn(3, 3).astype(np.float32), + "A": np.random.randn(3, 3).astype(np.float32), + "B": np.random.randn(3, 3).astype(np.float32), + } + ref = ExtendedReferenceEvaluator(onnx_model) + ort_eval = OnnxruntimeEvaluator(onnx_model, verbose=10, opsets=20) + expected = ref.run(None, feeds) + got = ort_eval.run(None, feeds) + self.assertEqualArray(expected[0], got[0]) + + @classmethod + def _trange(cls, *shape, bias: Optional[float] = None): + n = np.prod(shape) + x = np.arange(n).astype(np.float32) / n + if bias: + x = x + bias + return torch.from_numpy(x.reshape(tuple(shape)).astype(np.float32)) + + @classmethod + def _get_model_init(cls, itype) -> Tuple[ModelProto, Dict[str, Any], Tuple[Any, ...]]: + dtype = onnx_dtype_to_np_dtype(itype) + ttype = onnx_dtype_to_torch_dtype(itype) + cst = np.arange(6).astype(dtype) + model = oh.make_model( + oh.make_graph( + [ + oh.make_node("IsNaN", ["x"], ["xi"]), + oh.make_node("IsNaN", ["y"], ["yi"]), + oh.make_node("Cast", ["xi"], ["xii"], to=TensorProto.INT64), + oh.make_node("Cast", ["yi"], ["yii"], to=TensorProto.INT64), + oh.make_node("Add", ["xii", "yii"], ["gggg"]), + oh.make_node("Cast", ["gggg"], ["final"], to=itype), + ], + "dummy", + [oh.make_tensor_value_info("x", itype, [None, None])], + [oh.make_tensor_value_info("final", itype, [None, None])], + [from_array_extended(cst, name="y")], + ), + opset_imports=[oh.make_opsetid("", 20)], + ir_version=10, + ) + feeds = {"x": cls._trange(5, 6).to(ttype)} + expected = torch.isnan(feeds["x"]).to(int) + torch.isnan( + torch.from_numpy(cst.astype(float)) + ).to(int) + return (model, feeds, (expected.to(ttype),)) + + @hide_stdout() + def test_init_numpy_afloat32(self): + model, feeds, expected = self._get_model_init(TensorProto.FLOAT) + wrap = OnnxruntimeEvaluator( + model, providers="cpu", graph_optimization_level=False, verbose=10 + ) + got = wrap.run(None, {k: v.numpy() for k, v in feeds.items()}) + self.assertIsInstance(got[0], 
np.ndarray) + self.assertEqualArray(expected[0], got[0]) + + @hide_stdout() + def test_init_numpy_bfloat16(self): + model, feeds, expected = self._get_model_init(TensorProto.BFLOAT16) + wrap = OnnxruntimeEvaluator( + model, providers="cpu", graph_optimization_level=False, verbose=10 + ) + got = wrap.run( + None, {k: v.to(float).numpy().astype(ml_dtypes.bfloat16) for k, v in feeds.items()} + ) + self.assertIsInstance(got[0], np.ndarray) + self.assertEqualArray(expected[0], got[0]) + + @hide_stdout() + def test_init_torch_afloat32(self): + model, feeds, expected = self._get_model_init(TensorProto.FLOAT) + wrap = OnnxruntimeEvaluator( + model, providers="cpu", graph_optimization_level=False, verbose=10 + ) + got = wrap.run(None, feeds) + self.assertIsInstance(got[0], (torch.Tensor, np.ndarray)) + self.assertEqualArray(expected[0], got[0]) + + @hide_stdout() + def test_init_torch_bfloat16(self): + model, feeds, expected = self._get_model_init(TensorProto.BFLOAT16) + wrap = OnnxruntimeEvaluator( + model, providers="cpu", graph_optimization_level=False, verbose=10 + ) + got = wrap.run(None, feeds) + self.assertIsInstance(got[0], (torch.Tensor, np.ndarray)) + self.assertEqualArray(expected[0], got[0]) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_xrun_doc/test_helpers.py b/_unittests/ut_xrun_doc/test_helpers.py index 4033e5b3..0fef8bd1 100644 --- a/_unittests/ut_xrun_doc/test_helpers.py +++ b/_unittests/ut_xrun_doc/test_helpers.py @@ -5,7 +5,12 @@ import onnx import onnx.helper as oh import torch -from onnx_diagnostic.ext_test_case import ExtTestCase, skipif_ci_windows, hide_stdout +from onnx_diagnostic.ext_test_case import ( + ExtTestCase, + skipif_ci_windows, + hide_stdout, + requires_onnx, +) from onnx_diagnostic.helpers import ( string_type, string_sig, @@ -19,9 +24,11 @@ string_signature, make_hash, onnx_dtype_to_torch_dtype, + onnx_dtype_to_np_dtype, np_dtype_to_tensor_dtype, torch_dtype_to_onnx_dtype, from_array_extended, + to_array_extended, convert_endian, from_array_ml_dtypes, dtype_to_tensor_dtype, @@ -213,6 +220,7 @@ def test_size_type_onnx(self): "FLOAT8E4M3FNUZ", }: onnx_dtype_to_torch_dtype(i) + onnx_dtype_to_np_dtype(i) def test_size_type_numpy(self): for dt in { @@ -248,16 +256,21 @@ def test_from_array(self): t = np.random.rand(4, 3).astype(dt) proto = from_array_extended(t) self.assertIsInstance(proto, onnx.TensorProto) - convert_endian(proto) dtype_to_tensor_dtype(dt) + arr = to_array_extended(proto) + self.assertEqualArray(t, arr) + convert_endian(proto) + @requires_onnx("1.18.0") def test_from_array_ml_dtypes(self): for dt in { ml_dtypes.bfloat16, }: t = np.random.rand(4, 3).astype(dt) - from_array_ml_dtypes(t) + proto = from_array_ml_dtypes(t) from_array_extended(t) + arr = to_array_extended(proto) + self.assertEqualArray(t, arr) def test_size_type_mldtypes(self): for dt in { @@ -407,6 +420,28 @@ def test_rename_dynamic_expression(self): text = rename_dynamic_expression("a * 10 - a", {"a": "x"}) self.assertEqual(text, "x * 10 - x") + def test_from_tensor(self): + for dt in { + torch.float32, + torch.float64, + torch.bfloat16, + torch.float16, + torch.int32, + torch.int64, + torch.int8, + torch.int16, + torch.uint8, + torch.uint16, + torch.uint32, + torch.uint64, + }: + t = torch.arange(12).reshape((4, 3)).to(dt) + from_array_extended(t) + proto = from_array_extended(t, name="a") + self.assertIsInstance(proto, onnx.TensorProto) + convert_endian(proto) + dtype_to_tensor_dtype(dt) + if __name__ == "__main__": unittest.main(verbosity=2) 
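The bfloat16 tests above work because ml_dtypes provides a 2-byte bfloat16 dtype for numpy, obtained by reinterpreting the raw storage of a torch tensor. A minimal sketch of that round trip, independent of the helpers added in this PR (tensor values chosen for illustration):

import numpy as np
import ml_dtypes
import torch

t = torch.tensor([1.5, -2.0, 3.25], dtype=torch.bfloat16)
# bfloat16 and int16 share the same 2-byte storage, so the view is lossless.
raw = t.view(torch.int16).numpy()
# Reinterpret the same bytes as bfloat16 on the numpy side.
bf16 = raw.view(ml_dtypes.bfloat16)
print(bf16.astype(np.float32))  # back to a dtype numpy fully supports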
diff --git a/_unittests/ut_xrun_doc/test_ort_session.py b/_unittests/ut_xrun_doc/test_ort_session.py index c6860faf..e76297d0 100644 --- a/_unittests/ut_xrun_doc/test_ort_session.py +++ b/_unittests/ut_xrun_doc/test_ort_session.py @@ -1,6 +1,7 @@ import unittest -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np +import ml_dtypes import onnx import onnx.helper as oh import torch @@ -11,6 +12,11 @@ requires_onnxruntime_training, requires_cuda, ) +from onnx_diagnostic.helpers import ( + from_array_extended, + onnx_dtype_to_np_dtype, + onnx_dtype_to_torch_dtype, +) from onnx_diagnostic.ort_session import ( InferenceSessionForNumpy, InferenceSessionForTorch, @@ -232,6 +238,66 @@ def test_investigate_onnxruntime_issue_callable_str(self): onnx_to_session="cpu_session", ) + @classmethod + def _get_model_init(cls, itype) -> Tuple[onnx.ModelProto, Dict[str, Any], Tuple[Any, ...]]: + dtype = onnx_dtype_to_np_dtype(itype) + ttype = onnx_dtype_to_torch_dtype(itype) + cst = np.arange(6).astype(dtype) + model = oh.make_model( + oh.make_graph( + [ + oh.make_node("IsNaN", ["x"], ["xi"]), + oh.make_node("IsNaN", ["y"], ["yi"]), + oh.make_node("Cast", ["xi"], ["xii"], to=onnx.TensorProto.INT64), + oh.make_node("Cast", ["yi"], ["yii"], to=onnx.TensorProto.INT64), + oh.make_node("Add", ["xii", "yii"], ["gggg"]), + oh.make_node("Cast", ["gggg"], ["final"], to=itype), + ], + "dummy", + [oh.make_tensor_value_info("x", itype, [None, None])], + [oh.make_tensor_value_info("final", itype, [None, None])], + [from_array_extended(cst, name="y")], + ), + opset_imports=[oh.make_opsetid("", 20)], + ir_version=10, + ) + onnx.checker.check_model(model) + feeds = {"x": cls._range(5, 6).to(ttype)} + expected = torch.isnan(feeds["x"]).to(int) + torch.isnan( + torch.from_numpy(cst.astype(float)) + ).to(int) + return (model, feeds, (expected.to(ttype),)) + + def test_init_numpy_afloat32(self): + model, feeds, expected = self._get_model_init(onnx.TensorProto.FLOAT) + wrap = InferenceSessionForNumpy(model, providers="cpu", graph_optimization_level=False) + got = wrap.run(None, {k: v.numpy() for k, v in feeds.items()}) + self.assertIsInstance(got[0], np.ndarray) + self.assertEqualArray(expected[0], got[0]) + + def test_init_numpy_bfloat16(self): + model, feeds, expected = self._get_model_init(onnx.TensorProto.BFLOAT16) + wrap = InferenceSessionForNumpy(model, providers="cpu", graph_optimization_level=False) + got = wrap.run( + None, {k: v.to(float).numpy().astype(ml_dtypes.bfloat16) for k, v in feeds.items()} + ) + self.assertIsInstance(got[0], np.ndarray) + self.assertEqualArray(expected[0], got[0]) + + def test_init_torch_afloat32(self): + model, feeds, expected = self._get_model_init(onnx.TensorProto.FLOAT) + wrap = InferenceSessionForTorch(model, providers="cpu", graph_optimization_level=False) + got = wrap.run(None, feeds) + self.assertIsInstance(got[0], torch.Tensor) + self.assertEqualArray(expected[0], got[0]) + + def test_init_torch_bfloat16(self): + model, feeds, expected = self._get_model_init(onnx.TensorProto.BFLOAT16) + wrap = InferenceSessionForTorch(model, providers="cpu", graph_optimization_level=False) + got = wrap.run(None, feeds) + self.assertIsInstance(got[0], torch.Tensor) + self.assertEqualArray(expected[0], got[0]) + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/onnx_diagnostic/helpers.py b/onnx_diagnostic/helpers.py index 31dc7f5f..b3fbe468 100644 --- a/onnx_diagnostic/helpers.py +++ b/onnx_diagnostic/helpers.py @@ -1,4 +1,5 @@ 
import ast +import ctypes import enum import functools import inspect @@ -21,8 +22,7 @@ np_dtype_to_tensor_dtype as onnx_np_dtype_to_tensor_dtype, tensor_dtype_to_np_dtype as onnx_tensor_dtype_to_np_dtype, ) -from onnx.numpy_helper import from_array as onnx_from_array -from onnx.reference.op_run import to_array_extended +from onnx.numpy_helper import from_array as onnx_from_array, to_array def size_type(dtype: Any) -> int: @@ -317,6 +317,7 @@ def string_type( if not with_shape: return f"{prefix}F{i}r{len(obj.shape)}" return f"{prefix}F{i}s{'x'.join(map(str, obj.shape))}" + if isinstance(obj, torch.Tensor): if with_min_max: s = string_type(obj, with_shape=with_shape, with_device=with_device) @@ -341,6 +342,25 @@ def string_type( if not with_shape: return f"{prefix}T{i}r{len(obj.shape)}" return f"{prefix}T{i}s{'x'.join(map(str, obj.shape))}" + + if obj.__class__.__name__ == "OrtValue": + if not obj.has_value(): + return "OV()" + if not obj.is_tensor(): + return "OV(NOTENSOR)" + if with_min_max: + try: + t = obj.numpy() + except Exception: + # pass unable to convert into numpy (bfloat16, ...) + return "OV(NO-NUMPY:FIXIT)" + return f"OV({string_type(t, with_shape=with_shape, with_min_max=with_min_max)})" + dt = obj.element_type() + shape = obj.shape() + if with_shape: + return f"OV{dt}s{'x'.join(map(str, shape))}" + return f"OV{dt}r{len(shape)}" + if isinstance(obj, bool): if with_min_max: return f"bool={obj}" @@ -442,6 +462,7 @@ def string_type( if ignore: return f"{obj.__class__.__name__}(...)" + raise AssertionError(f"Unsupported type {type(obj).__name__!r} - {type(obj)}") @@ -709,14 +730,97 @@ def from_array_ml_dtypes(arr: npt.ArrayLike, name: Optional[str] = None) -> Tens return tensor +_STORAGE_TYPE = { + TensorProto.FLOAT16: np.int16, + TensorProto.BFLOAT16: np.int16, +} + + +def proto_from_tensor( + arr: "torch.Tensor", # noqa: F821 + name: Optional[str] = None, + verbose: int = 0, +) -> TensorProto: + """ + Converts a torch Tensor into a TensorProto. + + :param arr: tensor + :param verbose: display the type and shape + :return: a TensorProto + """ + import torch + + if not isinstance(arr, torch.Tensor): + raise TypeError(f"Unexpected type {type(arr)}.") + if arr.is_sparse: + raise NotImplementedError( + f"Sparse tensor is not supported yet but initializer {name!r} is." + ) + + # arr.contiguous() is slow after a transpose, maybe there is a way to optimize this. 
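+    # A transposed or sliced tensor is a non-contiguous view, so .contiguous()
+    # materializes a copy on the original device before .cpu() can run.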
+ if arr.is_contiguous(): + arr_cpu = arr.cpu() + else: + arr_cpu = arr.contiguous().cpu() + + numel = torch.numel(arr_cpu) + element_size = arr_cpu.element_size() + + if arr_cpu.dtype in {torch.bfloat16}: + np_arr = arr_cpu + elif arr_cpu.data_ptr() == arr.data_ptr(): + copy = arr_cpu.clone().detach().requires_grad_(False) + assert ( + arr_cpu.data_ptr() == 0 or arr_cpu.data_ptr() != copy.data_ptr() + ), f"Pointers are not null and different {arr_cpu.data_ptr()} != {copy.data_ptr()}" + np_arr = np.from_dlpack(copy) + else: + np_arr = np.from_dlpack(arr_cpu.detach()) + + tensor = TensorProto() + tensor.dims.extend(arr_cpu.shape) + if name: + tensor.name = name + itype = torch_dtype_to_onnx_dtype(arr_cpu.dtype) + assert not hasattr(TensorProto, "INT4") or itype not in { + TensorProto.INT4, + TensorProto.UINT4, + }, f"Type {arr.dtype} is not supported yet for name={name!r}" + tensor.data_type = itype + + if verbose > 1 and numel > 100: + print(f"[proto_from_array] {tensor.data_type}[{arr_cpu.shape}]") + + if isinstance(np_arr, torch.Tensor): + byte_data = (ctypes.c_ubyte * numel * element_size).from_address(np_arr.data_ptr()) + tensor.raw_data = bytes(byte_data) + if sys.byteorder == "big": + np_dtype = _STORAGE_TYPE[tensor.data_type] # type: ignore + np.byteswap(np.frombuffer(tensor.raw_data, dtype=np_dtype), inplace=True) + else: + tensor.raw_data = np_arr.tobytes() + if sys.byteorder == "big": + np_dtype = tensor_dtype_to_np_dtype(tensor.data_type) + np.byteswap(np.frombuffer(tensor.raw_data, dtype=np_dtype), inplace=True) + + return tensor + + def from_array_extended(tensor: npt.ArrayLike, name: Optional[str] = None) -> TensorProto: """ Converts an array into a :class:`onnx.TensorProto`. - :param tensor: numpy array + :param tensor: numpy array or torch tensor :param name: name :return: TensorProto """ + try: + import torch + except ImportError: + torch = None + if torch is not None and isinstance(tensor, torch.Tensor): + return proto_from_tensor(tensor, name=name) + from onnx.reference.ops.op_cast import ( bfloat16, float8e4m3fn, @@ -761,6 +865,16 @@ def from_array_extended(tensor: npt.ArrayLike, name: Optional[str] = None) -> Te return t +def to_array_extended(proto: TensorProto) -> npt.ArrayLike: + """Converts :class:`onnx.TensorProto` into a numpy array.""" + arr = to_array(proto) + if proto.data_type >= onnx.TensorProto.BFLOAT16: + # Types not supported by numpy + ml_dtypes = onnx_dtype_to_np_dtype(proto.data_type) + return arr.view(ml_dtypes) + return arr + + def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype": # noqa: F821 """ Converts an onnx type into a torch dtype. @@ -805,6 +919,51 @@ def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype": # noqa: F821 ) +def onnx_dtype_to_np_dtype(itype: int) -> Any: + """ + Converts an onnx type into a to numpy dtype. + That includes :epkg:`ml_dtypes` dtypes. 
+
+    :param itype: onnx dtype
+    :return: numpy dtype
+    """
+    if itype == TensorProto.FLOAT:
+        return np.float32
+    if itype == TensorProto.FLOAT16:
+        return np.float16
+    if itype == TensorProto.BFLOAT16:
+        import ml_dtypes
+
+        return ml_dtypes.bfloat16
+    if itype == TensorProto.DOUBLE:
+        return np.float64
+    if itype == TensorProto.INT32:
+        return np.int32
+    if itype == TensorProto.INT64:
+        return np.int64
+    if itype == TensorProto.UINT32:
+        return np.uint32
+    if itype == TensorProto.UINT64:
+        return np.uint64
+    if itype == TensorProto.BOOL:
+        return np.bool_
+    if itype == TensorProto.INT16:
+        return np.int16
+    if itype == TensorProto.UINT16:
+        return np.uint16
+    if itype == TensorProto.INT8:
+        return np.int8
+    if itype == TensorProto.UINT8:
+        return np.uint8
+    if itype == TensorProto.COMPLEX64:
+        return np.complex64
+    if itype == TensorProto.COMPLEX128:
+        return np.complex128
+    raise NotImplementedError(
+        f"Unable to convert onnx type {onnx_dtype_name(itype)} to a numpy dtype."
+    )
+
+
 def torch_dtype_to_onnx_dtype(to: "torch.dtype") -> int:  # noqa: F821
     """
     Converts a torch dtype into a onnx element type.
diff --git a/onnx_diagnostic/ort_session.py b/onnx_diagnostic/ort_session.py
index c8135cb5..8b8141e1 100644
--- a/onnx_diagnostic/ort_session.py
+++ b/onnx_diagnostic/ort_session.py
@@ -6,6 +6,13 @@
 from torch._C import _from_dlpack
 import onnxruntime
 from onnxruntime.capi import _pybind_state as ORTC
+from .helpers import (
+    torch_dtype_to_onnx_dtype,
+    onnx_dtype_to_np_dtype,
+    np_dtype_to_tensor_dtype,
+    onnx_dtype_name,
+    size_type,
+)
 
 DEVICES = {-1: ORTC.OrtDevice(ORTC.OrtDevice.cpu(), ORTC.OrtDevice.default_memory(), 0)}
@@ -48,7 +55,14 @@ def __init__(
     ):
         # onnxruntime is importing when needed as it takes a
         # couple of seconds if it contains CUDA EP.
+        can_use_training_api = True
         if isinstance(sess, (onnx.ModelProto, str)):
+            if isinstance(sess, onnx.ModelProto):
+                for i in sess.graph.initializer:
+                    if i.data_type >= onnx.TensorProto.BFLOAT16:
+                        # Cannot use training api as it relies too much on numpy.
+                        can_use_training_api = False
+                        break
             assert session_options is None or (
                 providers is None
                 and graph_optimization_level is None
@@ -113,7 +127,7 @@
         if log_verbosity_level is not None:
             self.run_options.log_verbosity_level = log_verbosity_level
 
-        self.use_training_api = (
+        self.use_training_api = can_use_training_api and (
             self.has_onnxruntime_training() if use_training_api is None else use_training_api
         )
@@ -174,9 +188,75 @@ def __init__(
 
     def run(
         self, output_names: Optional[List[str]], feeds: Dict[str, npt.ArrayLike]
-    ) -> List[npt.ArrayLike]:
+    ) -> List[Optional[npt.ArrayLike]]:
         """Calls :meth:`onnxruntime.InferenceSession.run`."""
-        return self.sess.run(output_names, feeds)
+        # sess.run does not support bfloat16
+        # res = self.sess.run(output_names, feeds)
+        return list(self.run_dlpack(output_names, feeds))
+
+    def run_dlpack(
+        self, output_names: Optional[List[str]], feeds: Dict[str, npt.ArrayLike]
+    ) -> Tuple[Optional[npt.ArrayLike], ...]:
+        """
+        Same as :meth:`onnxruntime.InferenceSession.run` except that
+        feeds is a dictionary of :class:`np.ndarray`.
+        The output device is CPU even if the outputs are on CUDA.
+ """ + new_feeds = {} + for k, v in feeds.items(): + if not k: + continue + new_feeds[k] = ( + ORTC.OrtValue.ortvalue_from_numpy_with_onnx_type( + v, np_dtype_to_tensor_dtype(v.dtype) + ) + if isinstance(v, np.ndarray) + else ORTC.OrtValue.from_dlpack(v.__dlpack__(), v.dtype == torch.bool) + ) + if self.nvtx: + self.torch.cuda.nvtx.range_push("run_with_ort_values") + ort_outputs = self.sess._sess.run_with_ort_values( + new_feeds, output_names or self.output_names, self.run_options + ) + if self.nvtx: + self.torch.cuda.nvtx.range_pop() + pth_outputs = self._ortvalues_to_numpy_tensor(ort_outputs) + return pth_outputs + + def _ortvalues_to_numpy_tensor( + self, + ortvalues: Union[List[ORTC.OrtValue], ORTC.OrtValueVector], + ) -> Tuple[Optional[npt.ArrayLike], ...]: + if len(ortvalues) == 0: + return tuple() + + if self.nvtx: + self.torch.cuda.nvtx.range_push("_ortvalues_to_numpy_tensor") + res: List[Optional[npt.ArrayLike]] = [] # noqa: F823 + for i in range(len(ortvalues)): + if not ortvalues[i].has_value(): + res.append(None) + continue + + el_type = ortvalues[i].element_type() + if el_type < onnx.TensorProto.BFLOAT16: + res.append(np.from_dlpack(ortvalues[i])) + continue + + # no easy conversion, let's use torch + tch = torch.from_dlpack(ortvalues[i].to_dlpack()) + size = size_type(el_type) + assert size == 2, f"Not implemented for type {onnx_dtype_name(el_type)}" + it = torch.uint16 + itch = tch.view(it) + npt = itch.numpy() + + dtype = onnx_dtype_to_np_dtype(el_type) + res.append(npt.view(dtype)) + + if self.nvtx: + self.torch.cuda.nvtx.range_pop() + return tuple(res) class InferenceSessionForTorch(_InferenceSession): @@ -225,33 +305,6 @@ def __init__( use_training_api=use_training_api, ) - self.TORCH_DTYPE_TO_ONNX_DTYPE = { - torch.float16: onnx.TensorProto.FLOAT16, - torch.bfloat16: onnx.TensorProto.BFLOAT16, - torch.float32: onnx.TensorProto.FLOAT, - torch.float64: onnx.TensorProto.DOUBLE, - torch.uint32: onnx.TensorProto.UINT32, - torch.uint16: onnx.TensorProto.UINT16, - torch.uint8: onnx.TensorProto.UINT8, - torch.int8: onnx.TensorProto.INT8, - torch.int16: onnx.TensorProto.INT16, - torch.int32: onnx.TensorProto.INT32, - torch.int64: onnx.TensorProto.INT64, - torch.bool: onnx.TensorProto.BOOL, - } - - self.TORCH_DTYPE_TO_NUMPY_DTYPE = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.int64, - torch.bool: np.bool_, - } - def _get_ortvalues_from_torch_tensors( self, tensors: Tuple[torch.Tensor, ...], n_outputs: int ) -> Tuple[ORTC.OrtValueVector, List[onnxruntime.OrtDevice]]: @@ -269,7 +322,7 @@ def _get_ortvalues_from_torch_tensors( new_tensors = [] for tensor in tensors: assert isinstance(tensor, self.torch.Tensor), f"Unexpected type {type(tensor)}" - dtypes.append(self.TORCH_DTYPE_TO_NUMPY_DTYPE[tensor.dtype]) + dtypes.append(onnx_dtype_to_np_dtype(torch_dtype_to_onnx_dtype(tensor.dtype))) shapes.append(tensor.size()) data_ptrs.append(tensor.data_ptr()) d = tensor.get_device() diff --git a/onnx_diagnostic/reference/__init__.py b/onnx_diagnostic/reference/__init__.py index e4db27cc..7a4d7128 100644 --- a/onnx_diagnostic/reference/__init__.py +++ b/onnx_diagnostic/reference/__init__.py @@ -1 +1,2 @@ from .evaluator import ExtendedReferenceEvaluator +from .ort_evaluator import OnnxruntimeEvaluator diff --git a/onnx_diagnostic/reference/ort_evaluator.py b/onnx_diagnostic/reference/ort_evaluator.py new file mode 100644 index 
00000000..501017f5
--- /dev/null
+++ b/onnx_diagnostic/reference/ort_evaluator.py
@@ -0,0 +1,420 @@
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+import numpy as np
+from onnx import (
+    GraphProto,
+    FunctionProto,
+    ModelProto,
+    NodeProto,
+    TypeProto,
+    ValueInfoProto,
+    helper as oh,
+    load,
+)
+from onnx.defs import onnx_opset_version
+import onnxruntime
+from ..helpers import pretty_onnx, dtype_to_tensor_dtype, string_type, to_array_extended
+from ..ort_session import InferenceSessionForTorch, InferenceSessionForNumpy, _InferenceSession
+
+PROTO = (FunctionProto, ModelProto, GraphProto, NodeProto)
+Proto = Union[FunctionProto, ModelProto, GraphProto, NodeProto]
+
+
+class OnnxruntimeEvaluator:
+    """
+    This class loads an onnx model and then executes the nodes one by one
+    with onnxruntime. This class is mostly meant for debugging.
+
+    :param proto: proto or filename
+    :param session_options: options
+    :param providers: `None`, `"CPU"`, `"CUDA"` or a list of providers
+    :param nvtx: enable nvidia events
+    :param enable_profiling: enables profiling
+    :param graph_optimization_level: see :class:`onnxruntime.SessionOptions`
+    :param log_severity_level: see :class:`onnxruntime.SessionOptions`
+    :param log_verbosity_level: see :class:`onnxruntime.SessionOptions`
+    :param optimized_model_filepath: see :class:`onnxruntime.SessionOptions`
+    :param disable_aot_function_inlining: see :class:`onnxruntime.SessionOptions`
+    :param use_training_api: use the onnxruntime-training API
+    :param verbose: verbosity
+    :param local_functions: additional local functions
+    :param ir_version: ir version to use when it is unknown
+    :param opsets: opsets to use when they are unknown
+    """
+
+    def __init__(
+        self,
+        proto: Union[str, Proto, "OnnxruntimeEvaluator"],
+        session_options: Optional[onnxruntime.SessionOptions] = None,
+        providers: Optional[Union[str, List[str]]] = None,
+        nvtx: bool = False,
+        enable_profiling: bool = False,
+        graph_optimization_level: Union[onnxruntime.GraphOptimizationLevel, bool] = None,
+        log_severity_level: Optional[int] = None,
+        log_verbosity_level: Optional[int] = None,
+        optimized_model_filepath: Optional[str] = None,
+        disable_aot_function_inlining: Optional[bool] = None,
+        use_training_api: bool = False,
+        verbose: int = 0,
+        local_functions: Optional[
+            Dict[Tuple[str, str], Union[Proto, "OnnxruntimeEvaluator"]]
+        ] = None,
+        ir_version: int = 10,
+        opsets: Optional[Union[int, Dict[str, int]]] = None,
+    ):
+        if isinstance(proto, str):
+            self.proto: Proto = load(proto)
+        elif isinstance(proto, OnnxruntimeEvaluator):
+            assert isinstance(
+                proto.proto, PROTO
+            ), f"Unexpected type for proto.proto {type(proto.proto)}"
+            self.proto = proto.proto
+        else:
+            self.proto = proto
+        assert isinstance(
+            self.proto, PROTO
+        ), f"Unexpected type for self.proto {type(self.proto)}"
+
+        self._cache: Dict[
+            Any, Tuple[Proto, Union["OnnxruntimeEvaluator", _InferenceSession]]  # noqa: UP037
+        ] = {}
+        self.ir_version = ir_version
+        self.opsets = opsets
+        self.session_kwargs: Dict[str, Any] = dict(
+            session_options=session_options,
+            providers=providers,
+            nvtx=nvtx,
+            enable_profiling=enable_profiling,
+            graph_optimization_level=graph_optimization_level,
+            log_severity_level=log_severity_level,
+            log_verbosity_level=log_verbosity_level,
+            optimized_model_filepath=optimized_model_filepath,
+            disable_aot_function_inlining=disable_aot_function_inlining,
+            use_training_api=use_training_api,
+        )
+
+        self.nodes = (
+            [self.proto]
+            if isinstance(self.proto, NodeProto)
+            else (
+                list(
+
self.proto.graph.node if hasattr(self.proto, "graph") else self.proto.node + ) + ) + ) + self.rt_inits_ = ( + {init.name: to_array_extended(init) for init in self.proto.graph.initializer} + if hasattr(self.proto, "graph") + else {} + ) + self.rt_nodes_ = self.nodes.copy() + self.verbose = verbose + self.local_functions: Dict[Tuple[str, str], "OnnxruntimeEvaluator"] = ( # noqa: UP037 + {(f.domain, f.name): self.__class__(f) for f in self.proto.functions} + if hasattr(self.proto, "functions") + else {} + ) + if local_functions: + self.local_functions.update(local_functions) + + @property + def input_names(self) -> List[str]: + "Returns input names." + if isinstance(self.proto, NodeProto): + return self.nodes[0].input + return [ + getattr(o, "name", o) + for o in ( + self.proto.graph.input if hasattr(self.proto, "graph") else self.proto.input + ) + ] + + @property + def output_names(self) -> List[str]: + "Returns output names." + if isinstance(self.proto, NodeProto): + return self.nodes[0].output + return [ + getattr(o, "name", o) + for o in ( + self.proto.graph.output if hasattr(self.proto, "graph") else self.proto.output + ) + ] + + @property + def input_types(self) -> List[TypeProto]: + "Returns input types." + if not isinstance(self.proto, (ModelProto, GraphProto)): + raise ValueError(f"Cannot guess input types for type {type(self.proto)}") + g = self.proto.graph if hasattr(self.proto, "graph") else self.proto + return [i.type for i in g.input] + + @property + def output_types(self) -> List[TypeProto]: + "Returns output types." + if not isinstance(self.proto, (ModelProto, GraphProto)): + raise ValueError(f"Cannot guess output types for type {type(self.proto)}") + g = self.proto.graph if hasattr(self.proto, "graph") else self.proto + return [i.type for i in g.output] + + def _log_arg(self, a: Any) -> Any: + if isinstance(a, (str, int, float)): + return a + device = f"D{a.get_device()}:" if hasattr(a, "detach") else "" + if hasattr(a, "shape"): + if self.verbose < 4: # noqa: PLR2004 + return f"{device}{a.dtype}:{a.shape} in [{a.min()}, {a.max()}]" + elements = a.ravel().tolist() + if len(elements) > 10: # noqa: PLR2004 + elements = elements[:10] + return f"{device}{a.dtype}:{a.shape}:{','.join(map(str, elements))}..." + return f"{device}{a.dtype}:{a.shape}:{elements}" + if hasattr(a, "append"): + return ", ".join(map(self._log_arg, a)) + return a + + def _log(self, level: int, pattern: str, *args: Any) -> None: + if level < self.verbose: + new_args = [self._log_arg(a) for a in args] + print(pattern % tuple(new_args)) + + def _is_local_function(self, node: NodeProto) -> bool: + return (node.domain, node.op_type) in self.local_functions + + def run( + self, + outputs: Optional[List[str]], + feed_inputs: Dict[str, Any], + intermediate: bool = False, + ) -> Union[Dict[str, Any], List[Any]]: + """ + Runs the model. + It only works with numpy arrays. 
+ + :param outputs: required outputs or None for all + :param feed_inputs: inputs + :param intermediate: returns all output instead of the last ones + :return: outputs, as a list if return_all is False, + as a dictionary if return_all is True + """ + if outputs is None: + outputs = self.output_names + results: Dict[str, Any] = self.rt_inits_.copy() + + for k, v in self.rt_inits_.items(): + self._log(2, " +C %s: %s", k, v) + for k, v in feed_inputs.items(): + self._log(2, " +I %s: %s", k, v) + results[k] = v + + for node in self.rt_nodes_: + self._log(1, "%s(%s) -> %s", node.op_type, node.input, node.output) + for i in node.input: + if i != "" and i not in results: + raise RuntimeError( + f"Unable to find input {i!r} in known results {sorted(results)}, " + f"self.rt_inits_ has {sorted(self.rt_inits_)}, " + f"feed_inputs has {sorted(feed_inputs)}." + ) + inputs = [(results[i] if i != "" else None) for i in node.input] + if node.op_type == "If" and node.domain == "": + outputs = self._run_if(node, inputs, results) + elif self._is_local_function(node): + outputs = self._run_local(node, inputs, results) + else: + outputs = self._run(node, inputs, results) + for name, value in zip(node.output, outputs): + if name == "": + continue + self._log(2, " + %s: %s", name, value) # type: ignore[arg-type] + assert isinstance(name, str), f"unexpected type for name {type(name)}" + results[name] = value + + if intermediate: + return results + output_names = self.output_names + for name in output_names: + if name == "": + continue + if name not in results: + raise RuntimeError( + f"Unable to find output name {name!r} " + f"in {sorted(results)}, proto is\n{pretty_onnx(self.proto)}" + ) + return [results[name] for name in output_names if name != ""] + + def _make_model_proto( + self, + nodes: Sequence[NodeProto], + vinputs: Sequence[ValueInfoProto], + voutputs: Sequence[ValueInfoProto], + ) -> ModelProto: + onx = oh.make_model( + oh.make_graph(nodes, "-", vinputs, voutputs), + ir_version=getattr(self.proto, "ir_version", self.ir_version), + functions=getattr(self.proto, "functions", None), + ) + del onx.opset_import[:] + if hasattr(self.proto, "opset_import"): + onx.opset_import.extend(self.proto.opset_import) + elif self.opsets: + if isinstance(self.opsets, int): + onx.opset_import.append(oh.make_opsetid("", self.opsets)) + else: + onx.opset_import.extend( + [oh.make_opsetid(k, v) for k, v in self.opsets.items()] + ) + else: + onx.opset_import.append(oh.make_opsetid("", onnx_opset_version())) + + return onx + + def _get_sess( + self, node: NodeProto, inputs: List[Any] + ) -> Tuple[ModelProto, _InferenceSession]: + unique_names = set() + vinputs = [] + for i, it in zip(node.input, inputs): + if i == "" or i in unique_names: + continue + unique_names.add(i) + value = oh.make_tensor_value_info(i, dtype_to_tensor_dtype(it.dtype), it.shape) + vinputs.append(value) + + # no need to run shape inference + voutputs = [oh.make_value_info(o, TypeProto()) for o in node.output] + onx = self._make_model_proto([node], vinputs, voutputs) + + cls = ( + InferenceSessionForNumpy + if any(isinstance(i, np.ndarray) for i in inputs) + else InferenceSessionForTorch + ) + try: + sess = cls(onx, **self.session_kwargs) + except ( + onnxruntime.capi.onnxruntime_pybind11_state.Fail, + onnxruntime.capi.onnxruntime_pybind11_state.InvalidGraph, + onnxruntime.capi.onnxruntime_pybind11_state.InvalidArgument, + ) as e: + raise RuntimeError( + f"Unable to infer a session with inputs\n{string_type(inputs)}" + f"\ndue to {e}\n{pretty_onnx(onx)}" 
+ ) from e + return onx, sess + + def _get_sess_if( + self, node: NodeProto, branch: str, inputs: List[Any], context: Dict[str, Any] + ) -> Tuple[ModelProto, "OnnxruntimeEvaluator"]: + unique_names = set() + vinputs = [] + for i, it in zip(node.input, inputs): + if i == "" or i in unique_names: + continue + unique_names.add(i) + value = oh.make_tensor_value_info(i, dtype_to_tensor_dtype(it.dtype), it.shape) + vinputs.append(value) + + for i, v in context.items(): + if i not in unique_names: + unique_names.add(i) + value = oh.make_tensor_value_info(i, dtype_to_tensor_dtype(v.dtype), v.shape) + vinputs.append(value) + + for att in node.attribute: + if att.name == branch: + g = att.g + + voutputs = g.output + + onx = self._make_model_proto(g.node, vinputs, voutputs) + sess = OnnxruntimeEvaluator( + onx, + local_functions=self.local_functions, + verbose=self.verbose, + ir_version=self.ir_version, + opsets=self.opsets, + **self.session_kwargs, + ) + return onx, sess + + def _get_sess_local( + self, node: NodeProto, inputs: List[Any] + ) -> Tuple[FunctionProto, "OnnxruntimeEvaluator"]: + ev = self.local_functions[node.domain, node.op_type] + sess = OnnxruntimeEvaluator( + ev, + local_functions=self.local_functions, + verbose=self.verbose, + ir_version=self.ir_version, + opsets=self.opsets, + **self.session_kwargs, + ) + return ev.proto, sess + + def _run(self, node: NodeProto, inputs: List[Any], results: Dict[str, Any]) -> List[Any]: + """Runs a node.""" + types = [(None if a is None else (a.dtype, a.shape)) for a in inputs] + key = (id(node), *types) + if key in self._cache: + sess = self._cache[key][1] + else: + onx, sess = self._get_sess(node, inputs) + self._cache[key] = onx, sess + + feeds = dict(zip(node.input, inputs)) + if "" in feeds: + feeds[""] = np.array([0], dtype=np.float32) + + assert hasattr(sess, "run"), f"Missing method run for type {type(sess)}" + outputs = list(sess.run(None, feeds)) + assert isinstance(outputs, list), f"Unexpected type for outputs {type(outputs)}" + return outputs + + def _run_if( + self, node: NodeProto, inputs: List[Any], results: Dict[str, Any] + ) -> List[Any]: + """Runs a node if.""" + feeds = dict(zip(node.input, inputs)) + feeds.update(results) + if feeds[node.input[0]]: + name = "then_branch" + else: + name = "else_branch" + + key = (id(node), name) + if key in self._cache: + sess = self._cache[key][1] + else: + self._cache[key] = onx, sess = self._get_sess_if(node, name, inputs, results) + + assert hasattr(sess, "run"), f"Missing method run for type {type(sess)}" + outputs = sess.run(None, feeds) + assert isinstance(outputs, list), f"Unexpected type for outputs {type(outputs)}" + return outputs + + def _run_local( + self, node: NodeProto, inputs: List[Any], results: Dict[str, Any] + ) -> List[Any]: + """Runs a node.""" + types = [(None if a is None else (a.dtype, a.shape)) for a in inputs] + key = (id(node), *types) + if key in self._cache: + sess = self._cache[key][1] + else: + onx, sess = self._get_sess_local(node, inputs) + self._cache[key] = onx, sess + + replace = dict(zip(node.input, sess.input_names)) + assert len(node.input) == len(sess.input_names), ( + f"Input mismatch: input_names={sess.input_names}, " + f"replace={replace}, " + f"type(self.proto)={type(self.proto)}, and node=\n{node}" + ) + feeds = {replace[i]: v for i, v in zip(node.input, inputs)} + if "" in feeds: + feeds[""] = np.array([0], dtype=np.float32) + + assert hasattr(sess, "run"), f"Missing method run for type {type(sess)}" + outputs = sess.run(None, feeds) + assert 
isinstance(outputs, list), f"Unexpected type for outputs {type(outputs)}" + return outputs
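To make the new evaluator concrete, here is a minimal usage sketch. It is not part of the PR; the model path and the feeds are placeholders that must be adapted to the model being debugged:

import numpy as np
from onnx_diagnostic.reference import OnnxruntimeEvaluator

# "model.onnx" is a placeholder path; the feeds must match the model's inputs.
ref = OnnxruntimeEvaluator("model.onnx", providers="cpu", verbose=1)
feeds = {"X": np.random.rand(2, 3).astype(np.float32)}
# intermediate=True returns a dictionary with every computed result,
# not only the model outputs, which helps locate the first failing node.
results = ref.run(None, feeds, intermediate=True)
for name, value in results.items():
    print(name, getattr(value, "dtype", None), getattr(value, "shape", None))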