:ref:`l-plot-parallelized-reduction`, reduction operations
are sensitive to parallelization.

-We consider a small model including a layer normalization
-followed by a matrix multiplication and we show that replacing
-a kernel by another one may significantly impact the output.
+Methodology
++++++++++++
+
+We consider a simple model with a LayerNormalization followed by a MatMul.
+Each operator can be run with :epkg:`onnxruntime` or :epkg:`pytorch`.
+We compare the four combinations.

The model
+++++++++
"""

import itertools
+import numpy as np
import pandas
import onnx
import onnx.helper as oh
import onnxruntime
import torch
from onnx_array_api.plotting.graphviz_helper import plot_dot
-from onnx_diagnostic.doc import rotate_align, save_fig
+from onnx_diagnostic.doc import rotate_align, save_fig, plot_histogram, title
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import max_diff, string_diff, string_type
from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name, onnx_dtype_to_np_dtype
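
The methodology above compares the same LayerNorm(X) @ W + B computed with different kernels. As a minimal sketch of that kind of comparison (not the code of the example itself), the snippet below computes the result once with the torch kernel and once with a hand-written numpy implementation, using arbitrary shapes, and reports the maximum absolute difference:

import numpy as np
import torch

x = torch.randn(2, 8, 16)
w = torch.randn(16, 16)
b = torch.randn(16)

# torch kernels for LayerNormalization and MatMul
y_torch = torch.nn.functional.layer_norm(x, (16,)) @ w + b

# numpy "kernel": normalize over the last axis (eps matches the default 1e-5)
xn = x.numpy()
mean = xn.mean(axis=-1, keepdims=True)
var = xn.var(axis=-1, keepdims=True)
y_np = (xn - mean) / np.sqrt(var + 1e-5) @ w.numpy() + b.numpy()

print("max abs diff:", float(np.abs(y_torch.numpy() - y_np).max()))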
@@ -80,6 +84,8 @@ def make_feeds(last_dim: int):


def cast_feeds(itype, provider, feeds):
+    ttype = onnx_dtype_to_torch_dtype(itype)
+    np_dtype = onnx_dtype_to_np_dtype(itype)
    np_feeds = {k: v.detach().numpy() for k, v in feeds.items()}
    if provider == "CUDA":
        if not torch.cuda.is_available():
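
The two lines moved into ``cast_feeds`` map the ONNX element type to the matching torch and numpy dtypes. A rough illustration of what such helpers return, using the standard onnx helper rather than the onnx_diagnostic ones used by the example:

import onnx
import torch
from onnx.helper import tensor_dtype_to_np_dtype

# Illustrative mapping from ONNX element types to numpy/torch dtypes.
to_torch = {onnx.TensorProto.FLOAT: torch.float32, onnx.TensorProto.FLOAT16: torch.float16}
for itype in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
    print(itype, tensor_dtype_to_np_dtype(itype), to_torch[itype])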
@@ -102,8 +108,6 @@ def cast_feeds(itype, provider, feeds):
baseline = {}

for provider, itype in itertools.product(["CPU", "CUDA"], [TFLOAT, TFLOAT16]):
-    ttype = onnx_dtype_to_torch_dtype(itype)
-    np_dtype = onnx_dtype_to_np_dtype(itype)
    tch_feeds, ort_feeds = cast_feeds(itype, provider, feeds)
    if tch_feeds is None:
        continue
@@ -156,6 +160,22 @@ def cast_feeds(itype, provider, feeds):
# Let's see which operator is responsible for them,
# *LayerNormalization* or *MatMul*.

+# %%
+# Distribution of the results
+# +++++++++++++++++++++++++++
+
+tensor = baseline[TFLOAT16, "CPU", "ort"][0].ravel().astype(np.float32)
+print(pandas.DataFrame({"expected": tensor}).describe())
+
+# %%
+# Histogram.
+
+save_fig(
+    title(plot_histogram(tensor), "Distribution of the computed results"),
+    "plot_layer_norm_discrepancies_hist.png",
+)
+
+
# %%
# Where do the discrepancies come from?
# +++++++++++++++++++++++++++++++++++++
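
``plot_histogram`` and ``title`` come from ``onnx_diagnostic.doc``. For readers without that package, a rough matplotlib equivalent of the histogram step, with a stand-in array instead of the real baseline tensor, could look like this:

import matplotlib.pyplot as plt
import numpy as np

values = np.random.normal(size=4096).astype(np.float32)  # stand-in for the baseline tensor
fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(values, bins=50)
ax.set_title("Distribution of the computed results")
fig.savefig("plot_layer_norm_discrepancies_hist.png")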
@@ -165,19 +185,18 @@ def cast_feeds(itype, provider, feeds):
data = []

for mod, provider, itype in itertools.product(
-    ["ORT-TORCH", "TORCH-ORT"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
+    ["ORT-ORT", "ORT-TORCH", "TORCH-ORT", "TORCH-TORCH"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
):
    ttype = onnx_dtype_to_torch_dtype(itype)
    np_dtype = onnx_dtype_to_np_dtype(itype)
    tch_feeds, _ = cast_feeds(itype, provider, feeds)
    if tch_feeds is None:
        continue

+    ker1, ker2 = mod.split("-")
    custom_kernels = (
-        {("", "LayerNormalization"): LayerNormalizationOrt}
-        if mod == "ORT-TORCH"
-        else {("", "MatMul"): MatMulOrt}
-    )
+        {("", "LayerNormalization"): LayerNormalizationOrt} if ker1 == "ORT" else {}
+    ) | ({("", "MatMul"): MatMulOrt} if ker2 == "ORT" else {})

    model = get_model(itype)
    print()
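
The rewritten selection logic covers all four kernel combinations with a dictionary union. A small standalone check of which kernels get replaced for each combination, with placeholders standing in for the custom onnxruntime-backed kernels defined earlier in the example:

# Placeholders for the custom onnxruntime-backed kernels used in the example.
LayerNormalizationOrt, MatMulOrt = "LayerNormalizationOrt", "MatMulOrt"

for mod in ["ORT-ORT", "ORT-TORCH", "TORCH-ORT", "TORCH-TORCH"]:
    ker1, ker2 = mod.split("-")
    custom_kernels = (
        {("", "LayerNormalization"): LayerNormalizationOrt} if ker1 == "ORT" else {}
    ) | ({("", "MatMul"): MatMulOrt} if ker2 == "ORT" else {})
    # ORT-ORT replaces both kernels, TORCH-TORCH replaces none.
    print(mod, sorted(k[1] for k in custom_kernels))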
@@ -206,7 +225,7 @@ def cast_feeds(itype, provider, feeds):
    )

# %%
-df = pandas.DataFrame(data).set_index(["model", "provider", "dtype"])
+df = pandas.DataFrame(data).set_index(["dtype", "provider", "model"])
df = df.sort_index()
print(df)

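Reordering the index to ``("dtype", "provider", "model")`` means ``sort_index()`` groups the four kernel combinations measured for the same dtype and provider next to each other, which is easier to read. A toy illustration with placeholder values (not measured discrepancies):

import pandas

toy = pandas.DataFrame(
    [
        {"dtype": "FLOAT16", "provider": "CPU", "model": "TORCH-TORCH", "diff_ort": 0.0},
        {"dtype": "FLOAT16", "provider": "CPU", "model": "ORT-ORT", "diff_ort": 0.0},
        {"dtype": "FLOAT", "provider": "CPU", "model": "ORT-TORCH", "diff_ort": 0.0},
    ]
)
print(toy.set_index(["dtype", "provider", "model"]).sort_index())
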
@@ -216,8 +235,17 @@ def cast_feeds(itype, provider, feeds):
save_fig(
    rotate_align(
        df[["diff_ort", "diff_torch"]].plot.bar(
-            title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B"
+            title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B",
+            figsize=(10, 4),
        )
    ),
    "plot_layer_norm_discrepancies_2.png",
)
+
+# %%
+# Conclusion
+# ++++++++++
+#
+# :epkg:`torch` seems to produce the same results when the same computation is
+# run multiple times. :epkg:`onnxruntime` only does so on CUDA.
+# With float16 on CUDA, LayerNormalization seems to introduce some discrepancies.
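
One hedged way to check the run-to-run reproducibility claim is to execute the same computation twice and compare the outputs; the same pattern applies to an onnxruntime ``InferenceSession`` by calling ``run()`` twice with identical inputs. A sketch with torch only, using arbitrary shapes:

import torch

x = torch.randn(2, 8, 16)
w = torch.randn(16, 16)
b = torch.randn(16)

def run_once():
    # Same LayerNorm(X) @ W + B pattern as in the example, default settings.
    return torch.nn.functional.layer_norm(x, (16,)) @ w + b

r1, r2 = run_once(), run_once()
print("max abs diff between two runs:", (r1 - r2).abs().max().item())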