Commit 7e5e798
add phi2
1 parent 44cf375
7 files changed, +217 -5 lines changed

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
+"""
+.. _l-plot-export_tiny_phi2:
+
+Untrained microsoft/phi-2
+=========================
+
+:epkg:`microsoft/phi-2` is not a big model but it is still quite big
+when it comes to writing unit tests. Function
+:func:`onnx_diagnostic.torch_models.hghub.get_untrained_model_with_inputs`
+can be used to create a reduced untrained version of a model coming from
+:epkg:`HuggingFace`. It downloads the configuration from the website
+but creates a dummy model with 1 or 2 hidden layers in order to reduce
+the size and get a fast execution. The goal is usually to test
+the export or to compare performance. Prediction quality does not matter.
+
+Create the dummy model
+++++++++++++++++++++++
+"""
+
+import copy
+import pprint
+import warnings
+import torch
+import onnxruntime
+from onnx_diagnostic import doc
+from onnx_diagnostic.helpers import max_diff, string_diff, string_type
+from onnx_diagnostic.helpers.cache_helper import is_cache_dynamic_registered
+from onnx_diagnostic.helpers.ort_session import make_feeds
+from onnx_diagnostic.torch_export_patches import bypass_export_some_errors
+from onnx_diagnostic.torch_models.hghub import (
+    get_untrained_model_with_inputs,
+)
+
+warnings.simplefilter("ignore")
+
+
+data = get_untrained_model_with_inputs("microsoft/phi-2")
+untrained_model, inputs, dynamic_shapes, config, size, n_weights = (
+    data["model"],
+    data["inputs"],
+    data["dynamic_shapes"],
+    data["configuration"],
+    data["size"],
+    data["n_weights"],
+)
+
+print(f"model {size / 2**10:1.3f} Kb with {n_weights} parameters.")
+# %%
+# The original model has 2.7 billion parameters. It was divided by more than 10.
+# Let's see the configuration.
+print(config)
+
+
+# %%
+# Inputs:
+
+print(string_type(inputs, with_shape=True))
+
+# %%
+# With min/max values.
+print(string_type(inputs, with_shape=True, with_min_max=True))
+
+# %%
+# And the dynamic shapes.
+pprint.pprint(dynamic_shapes)
+
+# %%
+# We execute the model to produce the expected outputs.
+expected = untrained_model(**copy.deepcopy(inputs))
+print(f"expected: {string_type(expected, with_shape=True, with_min_max=True)}")
+
+
+# %%
+# Export
+# ++++++
+
+
+with bypass_export_some_errors(patch_transformers=True) as modificator:
+
+    # Unnecessary steps but useful in case of an error.
+    # We check the cache is registered.
+    assert is_cache_dynamic_registered()
+
+    # We check there are no discrepancies once the patches are applied.
+    d = max_diff(expected, untrained_model(**copy.deepcopy(inputs)))
+    assert (
+        d["abs"] < 1e-5
+    ), f"The model with patches produces different outputs: {string_diff(d)}"
+
+    # Then we export.
+    ep = torch.export.export(
+        untrained_model,
+        (),
+        kwargs=modificator(copy.deepcopy(inputs)),
+        dynamic_shapes=dynamic_shapes,
+        strict=False,  # mandatory for torch==2.6
+    )
+
+    # We check the exported program produces the same results as well.
+    d = max_diff(expected, ep.module()(**copy.deepcopy(inputs)))
+    assert d["abs"] < 1e-5, f"The exported model produces different outputs: {string_diff(d)}"
+
+# %%
+# Export to ONNX
+# ++++++++++++++
+#
+# The export works. We can export to ONNX now.
+# Patches are still needed because the export
+# applies :meth:`torch.export.ExportedProgram.run_decompositions`,
+# which may export local pieces of the model again.
+
+with bypass_export_some_errors(patch_transformers=True):
+    epo = torch.onnx.export(
+        ep, (), kwargs=copy.deepcopy(inputs), dynamic_shapes=dynamic_shapes, dynamo=True
+    )
+
+# %%
+# We can save it.
+epo.save("plot_export_tiny_phi2.onnx", external_data=True)
+
+# Or directly get the :class:`onnx.ModelProto`.
+onx = epo.model_proto
+
+
+# %%
+# Discrepancies
+# +++++++++++++
+#
+# We now check the conversion to ONNX.
+# Let's make sure the ONNX model produces the same outputs.
+# It takes flattened inputs.
+
+feeds = make_feeds(onx, copy.deepcopy(inputs), use_numpy=True)
+
+print(f"torch inputs: {string_type(inputs)}")
+print(f"onnxrt inputs: {string_type(feeds)}")
+
+# %%
+# We then create an :class:`onnxruntime.InferenceSession`.
+
+sess = onnxruntime.InferenceSession(
+    onx.SerializeToString(), providers=["CPUExecutionProvider"]
+)
+
+# %%
+# Let's run it.
+got = sess.run(None, feeds)
+
+# %%
+# And finally the discrepancies.
+
+diff = max_diff(expected, got, flatten=True)
+print(f"onnx discrepancies: {string_diff(diff)}")
+
+# %%
+# It looks good.
+
+# %%
+doc.plot_legend("untrained smaller\nmicrosoft/phi-2", "torch.onnx.export", "green")
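
Note on the dynamic shapes printed by the example above: they follow the
structure :func:`torch.export.export` expects, a mapping from input name to
a dictionary from axis index to a dynamic dimension. A minimal sketch of
such a mapping, with hypothetical input names and axes (the real ones come
from get_untrained_model_with_inputs and are not shown in this commit):

import torch

# Hypothetical illustration only: the names and axes are assumptions.
batch = torch.export.Dim("batch")
seq = torch.export.Dim("seq")
hypothetical_dynamic_shapes = {
    "input_ids": {0: batch, 1: seq},       # batch size and sequence length vary
    "attention_mask": {0: batch, 1: seq},  # must vary consistently with input_ids
}
# torch.export.export(model, (), kwargs=inputs,
#                     dynamic_shapes=hypothetical_dynamic_shapes)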

_unittests/ut_helpers/test_helper.py

Lines changed: 12 additions & 0 deletions
@@ -174,6 +174,18 @@ def test_flatten(self):
         d = string_diff(diff)
         self.assertIsInstance(d, str)

+    def test_flatten_cache(self):
+        cache = make_dynamic_cache([(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)])
+        flat = flatten_object(cache, drop_keys=True)
+        self.assertEqual(string_type(flat), "(T1r4,T1r4)")
+        cache = dict(
+            cache=make_dynamic_cache(
+                [(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)]
+            )
+        )
+        flat = flatten_object(cache, drop_keys=True)
+        self.assertEqual(string_type(flat), "#2[T1r4,T1r4]")
+
     @hide_stdout()
     def test_max_diff_verbose(self):
         inputs = (
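
The two assertions above document how flatten_object and string_type
summarize a cache: a standalone DynamicCache flattens to a tuple, a dict of
caches to a list. A minimal sketch of the same calls outside the test
harness, assuming the import paths used elsewhere in this commit (reading
T1r4 as a rank-4 tensor with ONNX element type 1, float32, is an inference,
not stated in the diff):

import torch
from onnx_diagnostic.helpers.helper import flatten_object, string_type
from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache

# One (key, value) pair of rank-4 float32 tensors.
cache = make_dynamic_cache([(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)])
flat = flatten_object(cache, drop_keys=True)
print(string_type(flat))  # "(T1r4,T1r4)" according to the test above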

onnx_diagnostic/helpers/cache_helper.py

Lines changed: 6 additions & 1 deletion
@@ -5,12 +5,17 @@
 import transformers.cache_utils


-def is_cache_dynamic_registered() -> bool:
+def is_cache_dynamic_registered(fast: bool = False) -> bool:
     """
     Tells whether class :class:`transformers.cache_utils.DynamicCache` can be
     serialized and deserialized. Only then can :func:`torch.export.export`
     export a model.
+
+    :param fast: if True, do not check that the serialization works as well
+    :return: result
     """
+    if fast:
+        return transformers.cache_utils.DynamicCache in torch.utils._pytree.SUPPORTED_NODES
     bsize, nheads, slen, dim = 2, 4, 3, 7
     cache = make_dynamic_cache(
         [
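
A usage sketch for the new fast parameter: the fast path only checks the
pytree registration, while the default path (truncated in the hunk above)
also round-trips an actual cache through serialization.

from onnx_diagnostic.helpers.cache_helper import is_cache_dynamic_registered

# Cheap check: is DynamicCache registered with torch's pytree at all?
if not is_cache_dynamic_registered(fast=True):
    raise RuntimeError("DynamicCache not registered; torch.export.export would fail.")

# Full check (default): also serializes and deserializes a real cache.
assert is_cache_dynamic_registered()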

onnx_diagnostic/helpers/ort_session.py

Lines changed: 30 additions & 1 deletion
@@ -6,7 +6,8 @@
 from torch._C import _from_dlpack
 import onnxruntime
 from onnxruntime.capi import _pybind_state as ORTC
-from .helper import size_type
+from .cache_helper import is_cache_dynamic_registered
+from .helper import size_type, string_type, flatten_object
 from .onnx_helper import (
     torch_dtype_to_onnx_dtype,
     onnx_dtype_to_np_dtype,
@@ -17,6 +18,34 @@
 DEVICES = {-1: ORTC.OrtDevice(ORTC.OrtDevice.cpu(), ORTC.OrtDevice.default_memory(), 0)}


+def make_feeds(
+    proto: onnx.ModelProto, inputs: Any, use_numpy: bool = True
+) -> Dict[str, Union[torch.Tensor, np.ndarray]]:
+    """
+    Serializes the inputs to produce the feeds expected
+    by :class:`onnxruntime.InferenceSession`.
+
+    :param proto: onnx model
+    :param inputs: any kind of inputs
+    :param use_numpy: if True, converts torch tensors into numpy arrays
+    :return: feeds dictionary
+    """
+    flat = flatten_object(inputs, drop_keys=True)
+    assert (
+        not all(isinstance(obj, torch.Tensor) for obj in flat)
+        or not is_cache_dynamic_registered(fast=True)
+        or len(flat) == len(torch.utils._pytree.tree_flatten(inputs)[0])
+    ), (
+        f"Unexpected number of flattened objects, "
+        f"{string_type(flat, with_shape=True, limit=20)} != "
+        f"{string_type(torch.utils._pytree.tree_flatten(inputs)[0], with_shape=True, limit=20)}"
+    )
+    if use_numpy:
+        flat = [t.detach().cpu().numpy() if isinstance(t, torch.Tensor) else t for t in flat]
+    names = [i.name for i in proto.graph.input]
+    return dict(zip(names, flat))
+
+
 class _InferenceSession:

     @classmethod
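
A usage sketch for the new make_feeds, mirroring the example script above;
it assumes `inputs` is the same nested dictionary built there and that the
ONNX file is the one the script saves:

import onnx
import onnxruntime
from onnx_diagnostic.helpers.ort_session import make_feeds

proto = onnx.load("plot_export_tiny_phi2.onnx")
# make_feeds flattens nested inputs (dicts, caches, ...) and pairs the
# leaves with the graph input names, converting tensors to numpy arrays.
feeds = make_feeds(proto, inputs, use_numpy=True)
sess = onnxruntime.InferenceSession(
    proto.SerializeToString(), providers=["CPUExecutionProvider"]
)
got = sess.run(None, feeds)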

onnx_diagnostic/torch_export_patches/onnx_export_errors.py

Lines changed: 5 additions & 1 deletion
@@ -68,6 +68,8 @@ def unpatch_module(mod, info: Dict[type, Dict[type, Callable]], verbose: int = 0
 def _register_cache_serialization(verbose: int = 0) -> Dict[str, bool]:
     # Cache serialization: to be moved into appropriate packages
     import torch
+    import transformers
+    import packaging.version as pv

     try:
         from transformers.cache_utils import DynamicCache
@@ -108,7 +110,9 @@ def _register_cache_serialization(verbose: int = 0) -> Dict[str, bool]:
     # torch.fx._pytree.register_pytree_flatten_spec(
     #     DynamicCache, _flatten_dynamic_cache_for_fx)
     # so we remove it anyway
-    if DynamicCache in torch.fx._pytree.SUPPORTED_NODES:
+    if DynamicCache in torch.fx._pytree.SUPPORTED_NODES and pv.Version(
+        transformers.__version__
+    ) >= pv.Version("2.7"):
         if verbose:
             print("[_register_cache_serialization] DynamicCache is unregistered first.")
         _unregister(DynamicCache)
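
The new guard compares versions through packaging.version rather than plain
strings. A short illustration of why (the version numbers are arbitrary):

import packaging.version as pv

# String comparison gets multi-digit components wrong; Version does not.
assert "4.9.0" > "4.50.0"                          # lexicographic, misleading
assert pv.Version("4.50.0") > pv.Version("4.9.0")  # semantic, correct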

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@ def _patch_make_causal_mask(
 if sys.version_info[:2] <= (3, 11):

     @dataclass
-    class patched_AttentionMaskConverter:
+    class kkpatched_AttentionMaskConverter:
         """
         Patches
         ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
@@ -72,7 +72,7 @@ def _make_causal_mask(
 else:

     @dataclass
-    class patched_AttentionMaskConverter:
+    class kkpatched_AttentionMaskConverter:
         """
         Patches
         ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.

onnx_diagnostic/torch_models/hghub/model_inputs.py

Lines changed: 3 additions & 0 deletions
@@ -293,6 +293,9 @@ def get_untrained_model_with_inputs(
     kwargs.update(inputs_kwargs)

     model = getattr(transformers, arch)(config)
+    # This line is important. Some models may produce different
+    # outputs even with the same inputs in training mode.
+    model.eval()
     res = fct(model, config, **kwargs)
     res["input_kwargs"] = kwargs
     res["model_kwargs"] = mkwargs
