From 95881048dd9e72799b1a4dc6996b80295e65c3e8 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 11:21:16 +0200 Subject: [PATCH 1/9] add argmax --- onnx_diagnostic/helpers/helper.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 51a1c054..784fc89f 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -1181,6 +1181,7 @@ def max_diff( if exp_cpu.size == got_cpu.size else (np.inf, np.inf, np.inf, 0, np.inf) ) + argm = None else: abs_diff, rel_diff, sum_diff, n_diff, nan_diff = ( float(diff.max()), @@ -1189,6 +1190,7 @@ def max_diff( float(diff.size), float(ndiff.sum()), ) + argm = tuple(map(int, np.unravel_index(diff.argmax(), diff.shape))) if verbose >= 10 and (abs_diff >= 10 or rel_diff >= 10): # To understand the value it comes from. if debug_info: @@ -1219,7 +1221,9 @@ def max_diff( f"_index={_index}" ) - res = dict(abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff) + res = dict( + abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm + ) if hist: if isinstance(hist, bool): hist = np.array([0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100], dtype=diff.dtype) @@ -1284,8 +1288,10 @@ def max_diff( float(diff.numel()), float(ndiff.sum()), ) + argm = tuple(map(int, torch.unravel_index(diff.argmax(), diff.shape))) elif got_cpu.numel() == exp_cpu.numel(): abs_diff, rel_diff, sum_diff, n_diff, nan_diff = (0.0, 0.0, 0.0, 0.0, 0.0) + argm = None else: abs_diff, rel_diff, sum_diff, n_diff, nan_diff = ( np.inf, @@ -1294,6 +1300,7 @@ def max_diff( np.inf, np.inf, ) + argm = None if verbose >= 10 and (abs_diff >= 10 or rel_diff >= 10): # To understand the value it comes from. 
@@ -1325,7 +1332,9 @@ def max_diff( f"_index={_index}" ) - res = dict(abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff) + res = dict( + abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm + ) if hist: if isinstance(hist, bool): hist = torch.tensor( @@ -1478,6 +1487,8 @@ def string_diff(diff: Dict[str, Any]) -> str: rows.append(f"#{v}{k}") suffix = "-".join(rows) suffix = f"/{suffix}" + if "argm" in diff: + suffix += f", argmax={diff['argm']}" if diff.get("dnan", None): if diff["abs"] == 0 or diff["rel"] == 0: return f"abs={diff['abs']}, rel={diff['rel']}, dnan={diff['dnan']}{suffix}" From c4bfe8c7f063b56151682839d5a7b491e69ad4fb Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 12:04:05 +0200 Subject: [PATCH 2/9] ut --- _unittests/ut_helpers/test_bench_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_unittests/ut_helpers/test_bench_run.py b/_unittests/ut_helpers/test_bench_run.py index c0aaa98f..ac9579e3 100644 --- a/_unittests/ut_helpers/test_bench_run.py +++ b/_unittests/ut_helpers/test_bench_run.py @@ -105,14 +105,14 @@ def test_make_configs_replace(self): def test_max_diff(self): self.assertEqual( max_diff(torch.Tensor([1, 2]), torch.Tensor([1, 2])), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)}, ) self.assertEqual( max_diff( (torch.Tensor([1, 2]),), (torch.Tensor([1, 2])), ), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)}, ) self.assertEqual( max_diff( From da83cea66890185fb697dcec9ce779232bf5cfe5 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 14:06:28 +0200 Subject: [PATCH 3/9] fix git --- _unittests/ut_helpers/test_helper.py | 2 +- onnx_diagnostic/helpers/helper.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/_unittests/ut_helpers/test_helper.py b/_unittests/ut_helpers/test_helper.py index faa09fc6..b42ed072 100644 --- a/_unittests/ut_helpers/test_helper.py +++ b/_unittests/ut_helpers/test_helper.py @@ -245,7 +245,7 @@ def test_max_diff_hist_array_string_diff(self): diff = max_diff(x, y, hist=True) s = string_diff(diff) self.assertEndsWith( - "/#8>0.0-#8>0.0001-#6>0.001-#5>0.01-#5>0.1-#3>1.0-#2>10.0-#1>100.0", s + "/#8>0.0-#8>0.0001-#6>0.001-#5>0.01-#5>0.1-#3>1.0-#2>10.0-#1>100.0,amax=2,1", s ) def test_max_diff_hist_tensor(self): diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 784fc89f..47e8f702 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -2,7 +2,7 @@ import enum import inspect from dataclasses import is_dataclass, fields -from typing import Any, Callable, Dict, List, Optional, Set, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import numpy as np @@ -872,7 +872,7 @@ def max_diff( _index: int = 0, allow_unique_tensor_with_list_of_one_element: bool = True, hist: Optional[Union[bool, List[float]]] = None, -) -> Dict[str, float]: +) -> Dict[str, Union[float, str, int, Tuple[int, ...]]]: """ Returns the maximum discrepancy. 
@@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res = dict( + res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res = dict( + res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1488,7 +1488,12 @@ def string_diff(diff: Dict[str, Any]) -> str: suffix = "-".join(rows) suffix = f"/{suffix}" if "argm" in diff: - suffix += f", argmax={diff['argm']}" + sa = ( + ",".join(map(str, diff["argm"])) + if isinstance(diff["argm"], tuple) + else str(diff["argm"]) + ) + suffix += f",amax={sa}" if diff.get("dnan", None): if diff["abs"] == 0 or diff["rel"] == 0: return f"abs={diff['abs']}, rel={diff['rel']}, dnan={diff['dnan']}{suffix}" From 83a272cde972c465b4221987cf9e88a478cca37d Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 15:20:18 +0200 Subject: [PATCH 4/9] fix issues --- onnx_diagnostic/helpers/helper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 47e8f702..23a8154f 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -872,7 +872,7 @@ def max_diff( _index: int = 0, allow_unique_tensor_with_list_of_one_element: bool = True, hist: Optional[Union[bool, List[float]]] = None, -) -> Dict[str, Union[float, str, int, Tuple[int, ...]]]: +) -> Dict[str, Union[float, int, Tuple[int, ...]]]: """ Returns the maximum discrepancy. @@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( + res: Dict[str, Union[int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( + res: Dict[str, Union[int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: From 0e5e11de0e891713deaaadd62fa334072e63b593 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 16:11:21 +0200 Subject: [PATCH 5/9] mypy --- onnx_diagnostic/ext_test_case.py | 4 +++- onnx_diagnostic/helpers/helper.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/onnx_diagnostic/ext_test_case.py b/onnx_diagnostic/ext_test_case.py index 9662dc74..c7c55c61 100644 --- a/onnx_diagnostic/ext_test_case.py +++ b/onnx_diagnostic/ext_test_case.py @@ -1195,7 +1195,9 @@ def assert_onnx_disc( if verbose: print(f"[{vname}] diff {string_diff(diff)}") assert ( - not numpy.isnan(diff["abs"]) + isinstance(diff["abs"], float) + and isinstance(diff["rel"], float) + and not numpy.isnan(diff["abs"]) and diff["abs"] <= atol and not numpy.isnan(diff["rel"]) and diff["rel"] <= rtol diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 23a8154f..818a3ed0 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[int, float, Tuple[int, ...]]] = dict( + res: Dict[str, float] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[int, float, Tuple[int, ...]]] = 
dict( + res: Dict[str, float] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: From a41b3d1002516debdb63e85d94f6ca261c785aaf Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 17:53:12 +0200 Subject: [PATCH 6/9] add one gallery --- .github/workflows/check-release.yml | 1 + .gitignore | 1 + _doc/conf.py | 2 + _doc/index.rst | 1 + _doc/technical/README.txt | 2 + _doc/technical/plot_parallelized_reduction.py | 198 ++++++++++++++++++ .../test_documentation_technical.py | 97 +++++++++ _unittests/ut_xrun_doc/test_unit_test.py | 6 +- clean_onnx.sh | 19 ++ onnx_diagnostic/export/validate.py | 5 +- onnx_diagnostic/helpers/helper.py | 14 +- pyproject.toml | 2 + 12 files changed, 339 insertions(+), 9 deletions(-) create mode 100644 _doc/technical/README.txt create mode 100644 _doc/technical/plot_parallelized_reduction.py create mode 100644 _unittests/ut_xrun_doc/test_documentation_technical.py diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml index a179377b..8bf5e057 100644 --- a/.github/workflows/check-release.yml +++ b/.github/workflows/check-release.yml @@ -74,6 +74,7 @@ jobs: echo _unittests/ >> .git/info/sparse-checkout echo _doc/examples/ >> .git/info/sparse-checkout echo _doc/recipes/ >> .git/info/sparse-checkout + echo _doc/technical/ >> .git/info/sparse-checkout echo pyproject.toml >> .git/info/sparse-checkout echo requirements-dev.txt >> .git/info/sparse-checkout git pull origin main diff --git a/.gitignore b/.gitignore index ce041d2e..2b716f95 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ prof plot_*.txt _doc/auto_examples/* _doc/auto_recipes/* +_doc/auto_technical/* _doc/sg_execution_times.rst _doc/examples/_cache/* _doc/examples/dump_models/* diff --git a/_doc/conf.py b/_doc/conf.py index 77daebca..7936a4fd 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -156,11 +156,13 @@ def linkcode_resolve(domain, info): "examples_dirs": [ os.path.join(os.path.dirname(__file__), "examples"), os.path.join(os.path.dirname(__file__), "recipes"), + os.path.join(os.path.dirname(__file__), "technical"), ], # path where to save gallery generated examples "gallery_dirs": [ "auto_examples", "auto_recipes", + "auto_technical", ], # no parallelization to avoid conflict with environment variables "parallel": 1, diff --git a/_doc/index.rst b/_doc/index.rst index f2f374bf..43321cdc 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -39,6 +39,7 @@ It also implements tools to investigate, validate exported models (ExportedProgr cmds/index auto_examples/index auto_recipes/index + auto_technical/index .. toctree:: :maxdepth: 1 diff --git a/_doc/technical/README.txt b/_doc/technical/README.txt new file mode 100644 index 00000000..d9b5a645 --- /dev/null +++ b/_doc/technical/README.txt @@ -0,0 +1,2 @@ +Technical Details +================= diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py new file mode 100644 index 00000000..79a0b8d7 --- /dev/null +++ b/_doc/technical/plot_parallelized_reduction.py @@ -0,0 +1,198 @@ +""" +Reproducible Parallelized Reduction is difficult +================================================ + +A reduction is a frequent operation in neural network. It appears in layer normalization, +softmax. Because of the float precision, the result of the computation +changes based on the order of the elements. The following examples show the variation +based on different hypothesis on the vector distribution. 
+We consider a vector :math:`X = (x_1, ..., x_n)`.
+We compute its average:
+
+.. math::
+
+    mean(X) = \\frac{\\sum_{i=1}^n x_i}{n}
+
+Or the normalization of the vector:
+
+.. math::
+
+    norm(X)_i = \\frac{ X_i - \\mathbb{E}X}{ \\sqrt{ \\mathbb{V}X}}
+
+We draw 128 random permutation of X. The average or mean should not change.
+And the normalized vector should have the same value. In the first case, we compute
+the difference between the highest and the lowest values obtained for the average.
+In the second case, we look for the maximum difference between the original normalized
+vector and the permuted one (both sorted).
+
+The computation code
+++++++++++++++++++++
+"""
+
+import itertools
+from tqdm import tqdm
+import numpy as np
+import pandas
+
+DATA = []
+
+
+def str_dtype(dtype):
+    """Displays numpy dtype in a nicer way."""
+    if dtype == np.float64:
+        return "fp64"
+    if dtype == np.float32:
+        return "fp32"
+    if dtype == np.float16:
+        return "fp16"
+    raise ValueError(f"Unexpected value {dtype}")
+
+
+def layer_norm(a, eps=1e-6):
+    """
+    Normalizes the vector a.
+    The computation is done in float32 or float64.
+    """
+    ctype = np.float32 if a.dtype == np.float16 else a.dtype
+    a32 = a.astype(ctype)
+    m = a32.mean(axis=-1, keepdims=True)
+    c = a32 - m
+    va = np.sqrt((c * c).mean(axis=-1, keepdims=True))
+    va += eps
+    return (c / va).astype(a.dtype)
+
+
+def compute(values, fct):
+    """
+    Compare the results of function ``fct`` on a sample.
+    Loops over multiple sizes, dtypes. Tries 128 times.
+    """
+
+    def make_value(base, value):
+        if value.size > 1:
+            return np.abs(np.sort(base) - np.sort(value)).max()
+        return value
+
+    sizes = [2, 4, 8, 16, 512, 1024, 2048, 4096, 8192]
+    dtypes = [np.float64, np.float32, np.float16]
+    N = list(range(128))
+    exps = list(itertools.product(sizes, dtypes, N))
+    data = []
+    ech = None
+    for size, dtype, n in tqdm(exps):
+        if n == 0:
+            ech = values[:size].astype(dtype)
+            base = fct(ech)
+            assert base.dtype == ech.dtype
+        obs = dict(
+            n=n, size=size, dtype=str_dtype(ech.dtype), value=make_value(base, fct(ech))
+        )
+        data.append(obs)
+
+        if n == 1:
+            new_ech = np.sort(ech)
+        elif n == 2:
+            new_ech = np.sort(ech)[::-1]
+        else:
+            new_ech = np.random.permutation(ech)
+        assert new_ech.dtype == ech.dtype
+        assert new_ech.shape == ech.shape
+        obs = dict(
+            n=n + 1,
+            size=size,
+            dtype=str_dtype(new_ech.dtype),
+            value=make_value(base, fct(new_ech)),
+        )
+        data.append(obs)
+
+    df = pandas.DataFrame(data)
+    agg = df.drop("n", axis=1).groupby(["dtype", "size"], as_index=False).agg(["min", "max"])
+    agg["value", "delta"] = agg["value", "max"] - agg["value", "min"]
+    piv = agg.pivot(index="size", columns="dtype", values=("value", "delta"))
+    return piv
+
+
+# %%
+# Normal Law
+# ++++++++++
+#
+# Let's see what it returns on a random sample following a normal law.
+# First the average.
+
+values = np.random.randn(4096)
+mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
+mean["name"] = "normal"
+print(mean)
+
+# %%
+# Then the layer normalization.
+
+ln = compute(values, layer_norm)
+ln["name"] = "normal"
+DATA.append(ln.reset_index(drop=True).max(axis=0))
+print(ln)
+
+# %%
+# Fixed values
+# ++++++++++++
+#
+# We try a fixed vector with one very high value and all the others are small.
+
+values[:] = -1e-4
+values[::128] = 100
+mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
+mean["name"] = "fixed"
+print(mean)
+
+
+ln = compute(values, layer_norm)
+ln["name"] = "fixed"
+DATA.append(ln.reset_index(drop=True).max(axis=0))
+print(ln)
+
+# %%
+# Pareto Distribution
+# +++++++++++++++++++
+#
+# A law with a long tail.
+
+values = np.random.pareto(1, (4096,))
+print(values)
+
+mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
+mean["name"] = "pareto"
+print(mean)
+
+
+ln = compute(values, layer_norm)
+ln["name"] = "pareto"
+DATA.append(ln.reset_index(drop=True).max(axis=0))
+print(ln)
+
+# %%
+# Summary
+# +++++++
+#
+# We consider the maximum difference obtained for any sample size.
+
+print(DATA)
+df = pandas.DataFrame(DATA).set_index("name")
+print(df)
+
+# %%
+# Visually.
+
+ax = df.plot.bar(logy=True)
+fig = ax.get_figure()
+fig.savefig("plot_parallelized_reduction.png")
+
+# %%
+# In a deep neural network
+# ++++++++++++++++++++++++
+#
+# Some of the vectors have 500M values (16x32x1024x1024). A layer normalization
+# does 16x32x1024 ~ 0.5M reductions, over 20 layers.
+# When a deep neural network is computed with a difference code,
+# doing a different parallelization (GPU/CPU for example),
+# the order of the reduction may change and therefore,
+# some errors will appear and propagate.
diff --git a/_unittests/ut_xrun_doc/test_documentation_technical.py b/_unittests/ut_xrun_doc/test_documentation_technical.py
new file mode 100644
index 00000000..7ed73e87
--- /dev/null
+++ b/_unittests/ut_xrun_doc/test_documentation_technical.py
@@ -0,0 +1,97 @@
+import unittest
+import os
+import sys
+import importlib.util
+import subprocess
+import time
+from onnx_diagnostic import __file__ as onnx_diagnostic_file
+from onnx_diagnostic.ext_test_case import ExtTestCase, is_windows, ignore_errors
+
+
+VERBOSE = 0
+ROOT = os.path.realpath(os.path.abspath(os.path.join(onnx_diagnostic_file, "..", "..")))
+
+
+def import_source(module_file_path, module_name):
+    if not os.path.exists(module_file_path):
+        raise FileNotFoundError(module_file_path)
+    module_spec = importlib.util.spec_from_file_location(module_name, module_file_path)
+    if module_spec is None:
+        raise FileNotFoundError(
+            "Unable to find '{}' in '{}'.".format(module_name, module_file_path)
+        )
+    module = importlib.util.module_from_spec(module_spec)
+    return module_spec.loader.exec_module(module)
+
+
+class TestDocumentationTechnical(ExtTestCase):
+    def run_test(self, fold: str, name: str, verbose=0) -> int:
+        ppath = os.environ.get("PYTHONPATH", "")
+        if not ppath:
+            os.environ["PYTHONPATH"] = ROOT
+        elif ROOT not in ppath:
+            sep = ";" if is_windows() else ":"
+            os.environ["PYTHONPATH"] = ppath + sep + ROOT
+        perf = time.perf_counter()
+        try:
+            mod = import_source(fold, os.path.splitext(name)[0])
+            assert mod is not None
+        except FileNotFoundError:
+            # try another way
+            cmds = [sys.executable, "-u", os.path.join(fold, name)]
+            p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            res = p.communicate()
+            out, err = res
+            st = err.decode("ascii", errors="ignore")
+            if st and "Traceback" in st:
+                if '"dot" not found in path.'
in st: + # dot not installed, this part + # is tested in onnx framework + raise unittest.SkipTest(f"failed: {name!r} due to missing dot.") + if ( + "We couldn't connect to 'https://huggingface.co'" in st + or "Cannot access content at: https://huggingface.co/" in st + ): + raise unittest.SkipTest(f"Connectivity issues due to\n{err}") + raise AssertionError( # noqa: B904 + "Example '{}' (cmd: {} - exec_prefix='{}') " + "failed due to\n{}" + "".format(name, cmds, sys.exec_prefix, st) + ) + dt = time.perf_counter() - perf + if verbose: + print(f"{dt:.3f}: run {name!r}") + return 1 + + @classmethod + def add_test_methods(cls): + this = os.path.abspath(os.path.dirname(__file__)) + fold = os.path.normpath(os.path.join(this, "..", "..", "_doc", "technical")) + found = os.listdir(fold) + for name in found: + if not name.endswith(".py") or not name.startswith("plot_"): + continue + reason = None + + if reason: + + @unittest.skip(reason) + def _test_(self, name=name): + res = self.run_test(fold, name, verbose=VERBOSE) + self.assertTrue(res) + + else: + + @ignore_errors(OSError) # connectivity issues + def _test_(self, name=name): + res = self.run_test(fold, name, verbose=VERBOSE) + self.assertTrue(res) + + short_name = os.path.split(os.path.splitext(name)[0])[-1] + setattr(cls, f"test_{short_name}", _test_) + + +TestDocumentationTechnical.add_test_methods() + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_xrun_doc/test_unit_test.py b/_unittests/ut_xrun_doc/test_unit_test.py index f8e91da3..ca2dfa08 100644 --- a/_unittests/ut_xrun_doc/test_unit_test.py +++ b/_unittests/ut_xrun_doc/test_unit_test.py @@ -52,7 +52,11 @@ def test_statistics_on_folders(self): df = pandas.DataFrame(stat) gr = df.drop("name", axis=1).groupby(["ext", "dir"]).sum().reset_index() - gr = gr[(gr["dir"] != "_doc/auto_examples") & (gr["dir"] != "_doc/auto_recipes")] + gr = gr[ + (gr["dir"] != "_doc/auto_examples") + & (gr["dir"] != "_doc/auto_recipes") + & (gr["dir"] != "_doc/auto_technical") + ] total = ( gr[gr["dir"].str.contains("onnx_diagnostic/")] .drop(["ext", "dir"], axis=1) diff --git a/clean_onnx.sh b/clean_onnx.sh index d2a655d6..8716d179 100644 --- a/clean_onnx.sh +++ b/clean_onnx.sh @@ -65,6 +65,25 @@ rm _doc/recipes/*.script.onnx rm _doc/recipes/dump_models -rf rm _doc/recipes/dump_onx_* +rm _doc/technical/plot*.onnx +rm _doc/technical/plot*.onnx.weight +rm _doc/technical/plot*.onnx.data +rm _doc/technical/plot*.txt +rm _doc/technical/ort*.onnx +rm _doc/technical/*.sarif +rm _doc/technical/*.json +rm _doc/technical/*.png +rm _doc/technical/*.csv +rm _doc/technical/*.pte +rm _doc/technical/*.xlsx +rm _doc/technical/dummy*.onnx +rm _doc/technical/evaluation*-script.onnx +rm _doc/technical/*.opt.onnx +rm _doc/technical/*.dynamo.onnx +rm _doc/technical/*.script.onnx +rm _doc/technical/dump_models -rf +rm _doc/technical/dump_onx_* + rm _tools/bin -rf rm _tools/mambaroot -rf rm _tools/repos -rf diff --git a/onnx_diagnostic/export/validate.py b/onnx_diagnostic/export/validate.py index 8e7a000f..bf919597 100644 --- a/onnx_diagnostic/export/validate.py +++ b/onnx_diagnostic/export/validate.py @@ -96,7 +96,10 @@ def _get(a): ) print(f"[compare_modules] discrepancies={string_diff(diff)}") assert not exc or ( - diff["abs"] <= atol and diff["rel"] <= rtol + isinstance(diff["abs"], float) + and isinstance(diff["rel"], float) + and diff["abs"] <= atol + and diff["rel"] <= rtol ), f"Discrepancies={string_diff(diff)} higher than expected." 
return dict(args=args, kwargs=kwargs, expected=expected, got=got, diff=diff) return dict(args=args, kwargs=kwargs, got=got) diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 818a3ed0..a02c7ede 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -1080,8 +1080,8 @@ def max_diff( am = max(am, d["abs"]) dn = max(dn, d["dnan"]) rm = max(rm, d["rel"]) - sm += d["sum"] - n += d["n"] + sm += d["sum"] # type: ignore + n += d["n"] # type: ignore if "rep" in d: if drep is None: drep = d["rep"].copy() @@ -1091,7 +1091,7 @@ def max_diff( res = dict(abs=am, rel=rm, sum=sm, n=n, dnan=dn) if drep: res["rep"] = drep - return res + return res # type: ignore if isinstance(expected, dict): if verbose >= 6: @@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, float] = dict( + res: Dict[str, float] = dict( # type: ignore abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1232,7 +1232,7 @@ def max_diff( res["rep"] = dict( zip([f">{x}" for x in hist], [int(i) for i in (cou.sum() - np.cumsum(cou))]) ) - return res + return res # type: ignore if isinstance(expected, torch.Tensor) and isinstance(got, torch.Tensor): if verbose >= 6: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, float] = dict( + res: Dict[str, float] = dict( # type: ignore abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1349,7 +1349,7 @@ def max_diff( [int(i) for i in (cou.sum() - torch.cumsum(cou, 0))], ) ) - return res + return res # type: ignore if "SquashedNormal" in expected.__class__.__name__: if verbose >= 6: diff --git a/pyproject.toml b/pyproject.toml index 79fdfd4c..3c706861 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ packages = ["onnx_diagnostic"] exclude = [ "^_doc/auto_examples", # skips examples in the documentation "^_doc/auto_recipes", # skips examples in the documentation + "^_doc/auto_technical", # skips examples in the documentation "^_doc/conf.py", "^_doc/examples", "^_unittests", # skips unit tests @@ -102,6 +103,7 @@ select = [ "_doc/examples/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] "_doc/notebooks/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] "_doc/recipes/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] +"_doc/technical/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] "_unittests/*/test*.py" = ["B008", "B904", "PIE808", "SIM117", "SIM105", "UP008"] "onnx_diagnostic/export/__init__.py" = ["F401"] "onnx_diagnostic/helpers/__init__.py" = ["F401"] From a2d4e4ea7e289d99e37a02576788b1326003eec1 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 18:07:53 +0200 Subject: [PATCH 7/9] docu --- _doc/index.rst | 4 ++-- onnx_diagnostic/helpers/onnx_helper.py | 2 +- onnx_diagnostic/helpers/torch_helper.py | 11 ++++++++--- onnx_diagnostic/torch_models/test_helper.py | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/_doc/index.rst b/_doc/index.rst index 43321cdc..bafeae27 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -117,12 +117,12 @@ See :func:`onnx_diagnostic.helpers.string_type`. onnx_dtype_name +++++++++++++++ -See :func:`onnx_diagnostic.helpers.onnx_dtype_name`. +See :func:`onnx_diagnostic.helpers.onnx_helper.onnx_dtype_name`. .. 
code-block:: python import onnx - from onnx_diagnostic.helpers import onnx_dtype_name + from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name itype = onnx.TensorProto.BFLOAT16 print(onnx_dtype_name(itype)) diff --git a/onnx_diagnostic/helpers/onnx_helper.py b/onnx_diagnostic/helpers/onnx_helper.py index ec9cab1d..18173592 100644 --- a/onnx_diagnostic/helpers/onnx_helper.py +++ b/onnx_diagnostic/helpers/onnx_helper.py @@ -834,7 +834,7 @@ def tensor_statistics(tensor: Union[np.ndarray, TensorProto]) -> Dict[str, Union import pprint import numpy as np - from onnx_diagnostic.helper.onnx_helper import tensor_statistics + from onnx_diagnostic.helpers.onnx_helper import tensor_statistics t = np.random.rand(40, 50).astype(np.float16) pprint.pprint(tensor_statistics(t)) diff --git a/onnx_diagnostic/helpers/torch_helper.py b/onnx_diagnostic/helpers/torch_helper.py index 96fdd60b..465c74b9 100644 --- a/onnx_diagnostic/helpers/torch_helper.py +++ b/onnx_diagnostic/helpers/torch_helper.py @@ -102,8 +102,6 @@ def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype": # noqa: F821 :param to: onnx dtype :return: torch dtype """ - import torch - if itype == onnx.TensorProto.FLOAT: return torch.float32 if itype == onnx.TensorProto.FLOAT16: @@ -314,6 +312,9 @@ def steal_forward( import torch from onnx_diagnostic.helpers.torch_helper import steal_forward + from onnx_diagnostic.helpers.mini_onnx_builder import ( + create_input_tensors_from_onnx_model, + ) class SubModel(torch.nn.Module): def forward(self, x): @@ -363,7 +364,11 @@ def forward(self, x, y): for idx, m in model.named_modules(): level = str(idx).split(".") ll = len(level) - _, start_line = inspect.getsourcelines(m.forward) + try: + _, start_line = inspect.getsourcelines(m.forward) + except OSError: + # The code is not available. + start_line = 0 name = f"{idx}-{m.__class__.__name__}-{start_line}" models.append((f"{' ' * ll}{name}", m)) model = models diff --git a/onnx_diagnostic/torch_models/test_helper.py b/onnx_diagnostic/torch_models/test_helper.py index b187de3d..a9425ae5 100644 --- a/onnx_diagnostic/torch_models/test_helper.py +++ b/onnx_diagnostic/torch_models/test_helper.py @@ -780,7 +780,7 @@ def call_torch_export_export( disc = max_diff(data["expected"], expected) for k, v in disc.items(): - summary[f"disc_exported_{k}"] = v + summary[f"disc_exported_{k}"] = str(v) if verbose: print("[validate_model] done (exported run)") print(f"[validate_model] exported discrepancies={string_diff(disc)}") From f713f02de992fea85ad41873103c67e33fcda203 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 18:24:27 +0200 Subject: [PATCH 8/9] spell --- _doc/technical/plot_parallelized_reduction.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py index 79a0b8d7..3f6a576e 100644 --- a/_doc/technical/plot_parallelized_reduction.py +++ b/_doc/technical/plot_parallelized_reduction.py @@ -2,8 +2,8 @@ Reproducible Parallelized Reduction is difficult ================================================ -A reduction is a frequent operation in neural network. It appears in layer normalization, -softmax. Because of the float precision, the result of the computation +A reduction is a frequent operation with neural networks. It appears in layer normalization, +softmax... Because of the float precision, the result of the computation changes based on the order of the elements. 
The following examples show the variation
 based on different hypothesis on the vector distribution.
 
 We consider a vector :math:`X = (x_1, ..., x_n)`.
@@ -19,11 +19,13 @@ norm(X)_i = \\frac{ X_i - \\mathbb{E}X}{ \\sqrt{ \\mathbb{V}X}}
 
-We draw 128 random permutation of X. The average or mean should not change.
-And the normalized vector should have the same value. In the first case, we compute
+With :math:`\\mathbb{E}X = mean(X)`,
+:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\rigth)^2\\right)`.
+We draw 128 random permutations of X. The average or mean should not change.
+And the normalized vector should have the same values. In the first case, we compute
 the difference between the highest and the lowest values obtained for the average.
 In the second case, we look for the maximum difference between the original normalized
-vector and the permuted one (both sorted).
+vector and the permuted one, both sorted.
 
 The computation code
 ++++++++++++++++++++
@@ -144,7 +146,8 @@ def make_value(base, value):
 mean["name"] = "fixed"
 print(mean)
 
-
+# %%
+# And the normalized vector.
 ln = compute(values, layer_norm)
 ln["name"] = "fixed"
 DATA.append(ln.reset_index(drop=True).max(axis=0))
 print(ln)
@@ -163,7 +166,8 @@ def make_value(base, value):
 mean["name"] = "pareto"
 print(mean)
 
-
+# %%
+# And the normalized vector.
 ln = compute(values, layer_norm)
 ln["name"] = "pareto"
 DATA.append(ln.reset_index(drop=True).max(axis=0))
 print(ln)
@@ -175,7 +179,6 @@ def make_value(base, value):
 #
 # We consider the maximum difference obtained for any sample size.
 
-print(DATA)
 df = pandas.DataFrame(DATA).set_index("name")
 print(df)
@@ -192,7 +195,7 @@ def make_value(base, value):
 #
 # Some of the vectors have 500M values (16x32x1024x1024). A layer normalization
 # does 16x32x1024 ~ 0.5M reductions, over 20 layers.
-# When a deep neural network is computed with a difference code,
+# When a deep neural network is computed with a different code
 # doing a different parallelization (GPU/CPU for example),
 # the order of the reduction may change and therefore,
 # some errors will appear and propagate.

From fd15fd18af8fe7f644f23324d0686271d31baf47 Mon Sep 17 00:00:00 2001
From: xadupre
Date: Wed, 14 May 2025 18:27:46 +0200
Subject: [PATCH 9/9] spell

---
 _doc/technical/plot_parallelized_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py
index 3f6a576e..d69ff08e 100644
--- a/_doc/technical/plot_parallelized_reduction.py
+++ b/_doc/technical/plot_parallelized_reduction.py
@@ -20,7 +20,7 @@ norm(X)_i = \\frac{ X_i - \\mathbb{E}X}{ \\sqrt{ \\mathbb{V}X}}
 
 With :math:`\\mathbb{E}X = mean(X)`,
-:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\rigth)^2\\right)`.
+:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\right)^2\\right)`.
 We draw 128 random permutations of X. The average or mean should not change.
 And the normalized vector should have the same values. In the first case, we compute
 the difference between the highest and the lowest values obtained for the average.
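
To make the new field concrete, here is a minimal usage sketch. It is not part of the
patches above: it assumes the import path ``onnx_diagnostic.helpers.helper`` (the module
patched in this series), and it passes ``hist=True`` as the updated unit test
``test_max_diff_hist_array_string_diff`` does.

.. code-block:: python

    # Illustrative only: shows the ``argm`` field added by this series.
    import numpy as np

    from onnx_diagnostic.helpers.helper import max_diff, string_diff

    x = np.arange(12, dtype=np.float32).reshape(3, 4)
    y = x.copy()
    y[2, 1] += 1.0  # a single element differs

    diff = max_diff(x, y, hist=True)
    # ``argm`` is the unraveled index of the largest absolute difference.
    print(diff["argm"])  # (2, 1)
    # ``string_diff`` appends the location to its summary, e.g. ",amax=2,1".
    print(string_diff(diff))

Note that the aggregating branch of ``max_diff`` (the path merging results over nested
tuples or dicts, see patch 6) still builds ``res = dict(abs=am, rel=rm, sum=sm, n=n,
dnan=dn)`` without ``argm``, so the location is only reported when two arrays or tensors
are compared directly.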