fix bfloat16

xadupre · xadupre · commit 31a923468276 · 2025-03-23T12:03:50.000+01:00
diff --git a/_doc/examples/plot_failing_onnxruntime_evaluator.py b/_doc/examples/plot_failing_onnxruntime_evaluator.py
@@ -74,7 +74,7 @@
 
 ref = OnnxruntimeEvaluator(model, verbose=10)
 feeds = dict(
-    X=torch.rand((3, 4), dtype=torch.blofat16), Y=torch.rand((3, 4), dtype=torch.blofat16)
+    X=torch.rand((3, 4), dtype=torch.bfloat16), Y=torch.rand((3, 4), dtype=torch.bfloat16)
 )
 try:
     ref.run(None, feeds)
diff --git a/_unittests/ut_xrun_doc/test_helpers.py b/_unittests/ut_xrun_doc/test_helpers.py
@@ -19,6 +19,7 @@
     string_signature,
     make_hash,
     onnx_dtype_to_torch_dtype,
+    onnx_dtype_to_np_dtype,
     np_dtype_to_tensor_dtype,
     torch_dtype_to_onnx_dtype,
     from_array_extended,
@@ -213,6 +214,7 @@ def test_size_type_onnx(self):
                     "FLOAT8E4M3FNUZ",
                 }:
                     onnx_dtype_to_torch_dtype(i)
+                    onnx_dtype_to_np_dtype(i)
 
     def test_size_type_numpy(self):
         for dt in {
diff --git a/_unittests/ut_xrun_doc/test_ort_session.py b/_unittests/ut_xrun_doc/test_ort_session.py
@@ -1,6 +1,7 @@
 import unittest
-from typing import Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 import numpy as np
+import ml_dtypes
 import onnx
 import onnx.helper as oh
 import torch
@@ -11,6 +12,11 @@
     requires_onnxruntime_training,
     requires_cuda,
 )
+from onnx_diagnostic.helpers import (
+    from_array_extended,
+    onnx_dtype_to_np_dtype,
+    onnx_dtype_to_torch_dtype,
+)
 from onnx_diagnostic.ort_session import (
     InferenceSessionForNumpy,
     InferenceSessionForTorch,
@@ -232,6 +238,66 @@ def test_investigate_onnxruntime_issue_callable_str(self):
             onnx_to_session="cpu_session",
         )
 
+    @classmethod
+    def _get_model_init(cls, itype) -> Tuple[onnx.ModelProto, Dict[str, Any], Tuple[Any, ...]]:
+        """
+        Returns (model, feeds, expected outputs) for ``Cast(IsNaN(x) + IsNaN(y))``.
+        """
+        np_dtype = onnx_dtype_to_np_dtype(itype)
+        torch_dtype = onnx_dtype_to_torch_dtype(itype)
+        init = np.arange(6).astype(np_dtype)
+        nodes = [
+            oh.make_node("IsNaN", ["x"], ["xi"]),
+            oh.make_node("IsNaN", ["y"], ["yi"]),
+            oh.make_node("Cast", ["xi"], ["xii"], to=onnx.TensorProto.INT64),
+            oh.make_node("Cast", ["yi"], ["yii"], to=onnx.TensorProto.INT64),
+            oh.make_node("Add", ["xii", "yii"], ["gggg"]),
+            oh.make_node("Cast", ["gggg"], ["final"], to=itype),
+        ]
+        graph = oh.make_graph(
+            nodes,
+            "dummy",
+            [oh.make_tensor_value_info("x", itype, [None, None])],
+            [oh.make_tensor_value_info("final", itype, [None, None])],
+            [from_array_extended(init, name="y")],
+        )
+        onx = oh.make_model(graph, opset_imports=[oh.make_opsetid("", 20)], ir_version=10)
+        onnx.checker.check_model(onx)
+        feeds = {"x": cls._range(5, 6).to(torch_dtype)}
+        nan_x = torch.isnan(feeds["x"]).to(int)
+        nan_y = torch.isnan(torch.from_numpy(init.astype(float))).to(int)
+        return (onx, feeds, ((nan_x + nan_y).to(torch_dtype),))
+
+    def test_init_numpy_afloat32(self):
+        onx, inputs, wanted = self._get_model_init(onnx.TensorProto.FLOAT)
+        sess = InferenceSessionForNumpy(onx, providers="cpu", graph_optimization_level=False)
+        outputs = sess.run(None, {name: tensor.numpy() for name, tensor in inputs.items()})
+        self.assertIsInstance(outputs[0], np.ndarray)
+        self.assertEqualArray(wanted[0], outputs[0])
+
+    def test_init_numpy_bfloat16(self):
+        onx, inputs, wanted = self._get_model_init(onnx.TensorProto.BFLOAT16)
+        sess = InferenceSessionForNumpy(onx, providers="cpu", graph_optimization_level=False)
+        # feeds: torch tensor -> float -> numpy array -> ml_dtypes.bfloat16
+        arrays = {k: t.to(float).numpy().astype(ml_dtypes.bfloat16) for k, t in inputs.items()}
+        outputs = sess.run(None, arrays)
+        self.assertIsInstance(outputs[0], np.ndarray)
+        self.assertEqualArray(wanted[0], outputs[0])
+
+    def test_init_torch_afloat32(self):
+        onx, inputs, wanted = self._get_model_init(onnx.TensorProto.FLOAT)
+        sess = InferenceSessionForTorch(onx, providers="cpu", graph_optimization_level=False)
+        outputs = sess.run(None, inputs)
+        self.assertIsInstance(outputs[0], torch.Tensor)
+        self.assertEqualArray(wanted[0], outputs[0])
+
+    def test_init_torch_bfloat16(self):
+        onx, inputs, wanted = self._get_model_init(onnx.TensorProto.BFLOAT16)
+        sess = InferenceSessionForTorch(onx, providers="cpu", graph_optimization_level=False)
+        outputs = sess.run(None, inputs)
+        self.assertIsInstance(outputs[0], torch.Tensor)
+        self.assertEqualArray(wanted[0], outputs[0])
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_diagnostic/helpers.py b/onnx_diagnostic/helpers.py
@@ -889,6 +889,51 @@ def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype":  # noqa: F821
     )
 
 
+def onnx_dtype_to_np_dtype(itype: int) -> Any:
+    """
+    Converts an onnx element type into a numpy dtype,
+    :epkg:`ml_dtypes` dtypes included (bfloat16).
+
+    :param itype: onnx element type, ``TensorProto.FLOAT`` for example
+    :return: numpy dtype, ``NotImplementedError`` is raised if unsupported
+    """
+    if itype == TensorProto.FLOAT:
+        return np.float32
+    if itype == TensorProto.FLOAT16:
+        return np.float16
+    if itype == TensorProto.BFLOAT16:
+        import ml_dtypes
+
+        return ml_dtypes.bfloat16
+    if itype == TensorProto.DOUBLE:
+        return np.float64
+    if itype == TensorProto.INT32:
+        return np.int32
+    if itype == TensorProto.INT64:
+        return np.int64
+    if itype == TensorProto.UINT32:
+        return np.uint32
+    if itype == TensorProto.UINT64:
+        return np.uint64
+    if itype == TensorProto.BOOL:
+        return np.bool_  # np.bool does not exist in numpy 1.24 to 1.26
+    if itype == TensorProto.INT16:
+        return np.int16
+    if itype == TensorProto.UINT16:
+        return np.uint16
+    if itype == TensorProto.INT8:
+        return np.int8
+    if itype == TensorProto.UINT8:
+        return np.uint8
+    if itype == TensorProto.COMPLEX64:
+        return np.complex64
+    if itype == TensorProto.COMPLEX128:
+        return np.complex128
+    raise NotImplementedError(
+        f"Unable to convert onnx type {onnx_dtype_name(itype)} to numpy dtype."
+    )
+
+
 def torch_dtype_to_onnx_dtype(to: "torch.dtype") -> int:  # noqa: F821
     """
     Converts a torch dtype into a onnx element type.
diff --git a/onnx_diagnostic/ort_session.py b/onnx_diagnostic/ort_session.py
@@ -6,6 +6,13 @@
 from torch._C import _from_dlpack
 import onnxruntime
 from onnxruntime.capi import _pybind_state as ORTC
+from .helpers import (
+    torch_dtype_to_onnx_dtype,
+    onnx_dtype_to_np_dtype,
+    np_dtype_to_tensor_dtype,
+    onnx_dtype_name,
+    size_type,
+)
 
 DEVICES = {-1: ORTC.OrtDevice(ORTC.OrtDevice.cpu(), ORTC.OrtDevice.default_memory(), 0)}
 
@@ -48,7 +55,14 @@ def __init__(
     ):
         # onnxruntime is importing when needed as it takes a
         # couple of seconds if it contains CUDA EP.
+        can_use_training_api = True
         if isinstance(sess, (onnx.ModelProto, str)):
+            if isinstance(sess, onnx.ModelProto):
+                for i in sess.graph.initializer:
+                    if i.data_type >= onnx.TensorProto.BFLOAT16:
+                        # Cannot use training api as it relies too much on numpy.
+                        can_use_training_api = False
+                        break
             assert session_options is None or (
                 providers is None
                 and graph_optimization_level is None
@@ -113,7 +127,7 @@ def __init__(
         if log_verbosity_level is not None:
             self.run_options.log_verbosity_level = log_verbosity_level
 
-        self.use_training_api = (
+        self.use_training_api = can_use_training_api and (
             self.has_onnxruntime_training() if use_training_api is None else use_training_api
         )
 
@@ -176,7 +190,76 @@ def run(
         self, output_names: Optional[List[str]], feeds: Dict[str, npt.ArrayLike]
     ) -> List[npt.ArrayLike]:
         """Calls :meth:`onnxruntime.InferenceSession.run`."""
-        return self.sess.run(output_names, feeds)
+        if any(
+            (np_dtype_to_tensor_dtype(v.dtype) >= onnx.TensorProto.BFLOAT16)
+            for v in feeds.values()
+        ):
+            # bfloat16 not supported by onnxruntime
+            return self.run_dlpack(output_names, feeds)
+        if self.nvtx:
+            self.torch.cuda.nvtx.range_push("run")
+        res = self.sess.run(output_names, feeds)
+        if self.nvtx:
+            self.torch.cuda.nvtx.range_pop()
+        return res
+
+    def run_dlpack(
+        self, output_names: Optional[List[str]], feeds: Dict[str, np.ndarray]
+    ) -> Tuple[np.ndarray, ...]:
+        """
+        Same as :meth:`onnxruntime.InferenceSession.run` except that feeds is a
+        dictionary of :class:`np.ndarray` converted into OrtValue, outputs are
+        numpy arrays. The output device is CPU even if the outputs are on CUDA.
+        """
+        new_feeds = {
+            k: ORTC.OrtValue.ortvalue_from_numpy_with_onnx_type(
+                v, np_dtype_to_tensor_dtype(v.dtype)
+            )
+            for k, v in feeds.items()
+        }
+        if self.nvtx:
+            self.torch.cuda.nvtx.range_push("run_with_ort_values")
+        ort_outputs = self.sess._sess.run_with_ort_values(
+            new_feeds, output_names or self.output_names, self.run_options
+        )
+        if self.nvtx:
+            self.torch.cuda.nvtx.range_pop()
+        return self._ortvalues_to_numpy_tensor(ort_outputs)
+
+    def _ortvalues_to_numpy_tensor(
+        self,
+        ortvalues: Union[List[ORTC.OrtValue], ORTC.OrtValueVector],
+    ) -> Tuple[Optional[np.ndarray], ...]:
+        """
+        Converts OrtValues into numpy arrays, ``None`` stands for a missing value.
+        Types below BFLOAT16 use ``np.from_dlpack``, the others go through torch.
+        """
+        if len(ortvalues) == 0:
+            return tuple()
+
+        if self.nvtx:
+            self.torch.cuda.nvtx.range_push("_ortvalues_to_numpy_tensor")
+        res = []
+        for i in range(len(ortvalues)):
+            if not ortvalues[i].has_value():
+                res.append(None)
+                continue
+
+            el_type = ortvalues[i].element_type()
+            if el_type < onnx.TensorProto.BFLOAT16:
+                res.append(np.from_dlpack(ortvalues[i]))
+                continue
+            # no easy conversion, let's use torch: the tensor is viewed as uint16
+            # (2-byte items only), then the array is viewed with the numpy dtype
+            tch = torch.from_dlpack(ortvalues[i].to_dlpack())
+            size = size_type(el_type)
+            assert size == 2, f"Not implemented for type {onnx_dtype_name(el_type)}"
+            raw = tch.view(torch.uint16).numpy()
+            res.append(raw.view(onnx_dtype_to_np_dtype(el_type)))
+
+        if self.nvtx:
+            self.torch.cuda.nvtx.range_pop()
+        return tuple(res)
 
 
 class InferenceSessionForTorch(_InferenceSession):
@@ -225,33 +308,6 @@ def __init__(
             use_training_api=use_training_api,
         )
 
-        self.TORCH_DTYPE_TO_ONNX_DTYPE = {
-            torch.float16: onnx.TensorProto.FLOAT16,
-            torch.bfloat16: onnx.TensorProto.BFLOAT16,
-            torch.float32: onnx.TensorProto.FLOAT,
-            torch.float64: onnx.TensorProto.DOUBLE,
-            torch.uint32: onnx.TensorProto.UINT32,
-            torch.uint16: onnx.TensorProto.UINT16,
-            torch.uint8: onnx.TensorProto.UINT8,
-            torch.int8: onnx.TensorProto.INT8,
-            torch.int16: onnx.TensorProto.INT16,
-            torch.int32: onnx.TensorProto.INT32,
-            torch.int64: onnx.TensorProto.INT64,
-            torch.bool: onnx.TensorProto.BOOL,
-        }
-
-        self.TORCH_DTYPE_TO_NUMPY_DTYPE = {
-            torch.float16: np.float16,
-            torch.float32: np.float32,
-            torch.float64: np.float64,
-            torch.uint8: np.uint8,
-            torch.int8: np.int8,
-            torch.int16: np.int16,
-            torch.int32: np.int32,
-            torch.int64: np.int64,
-            torch.bool: np.bool_,
-        }
-
     def _get_ortvalues_from_torch_tensors(
         self, tensors: Tuple[torch.Tensor, ...], n_outputs: int
     ) -> Tuple[ORTC.OrtValueVector, List[onnxruntime.OrtDevice]]:
@@ -269,7 +325,7 @@ def _get_ortvalues_from_torch_tensors(
         new_tensors = []
         for tensor in tensors:
             assert isinstance(tensor, self.torch.Tensor), f"Unexpected type {type(tensor)}"
-            dtypes.append(self.TORCH_DTYPE_TO_NUMPY_DTYPE[tensor.dtype])
+            dtypes.append(onnx_dtype_to_np_dtype(torch_dtype_to_onnx_dtype(tensor.dtype)))
             shapes.append(tensor.size())
             data_ptrs.append(tensor.data_ptr())
             d = tensor.get_device()

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@`
`74`	`74`
`75`	`75`	`ref = OnnxruntimeEvaluator(model, verbose=10)`
`76`	`76`	`feeds = dict(`
`77`		`- X=torch.rand((3, 4), dtype=torch.blofat16), Y=torch.rand((3, 4), dtype=torch.blofat16)`
	`77`	`+ X=torch.rand((3, 4), dtype=torch.bfloat16), Y=torch.rand((3, 4), dtype=torch.bfloat16)`
`78`	`78`	`)`
`79`	`79`	`try:`
`80`	`80`	`ref.run(None, feeds)`