1 change: 1 addition & 0 deletions .github/workflows/check-release.yml
@@ -74,6 +74,7 @@ jobs:
echo _unittests/ >> .git/info/sparse-checkout
echo _doc/examples/ >> .git/info/sparse-checkout
echo _doc/recipes/ >> .git/info/sparse-checkout
echo _doc/technical/ >> .git/info/sparse-checkout
echo pyproject.toml >> .git/info/sparse-checkout
echo requirements-dev.txt >> .git/info/sparse-checkout
git pull origin main
1 change: 1 addition & 0 deletions .gitignore
@@ -57,6 +57,7 @@ prof
plot_*.txt
_doc/auto_examples/*
_doc/auto_recipes/*
_doc/auto_technical/*
_doc/sg_execution_times.rst
_doc/examples/_cache/*
_doc/examples/dump_models/*
2 changes: 2 additions & 0 deletions _doc/conf.py
@@ -156,11 +156,13 @@ def linkcode_resolve(domain, info):
"examples_dirs": [
os.path.join(os.path.dirname(__file__), "examples"),
os.path.join(os.path.dirname(__file__), "recipes"),
os.path.join(os.path.dirname(__file__), "technical"),
],
# path where to save gallery generated examples
"gallery_dirs": [
"auto_examples",
"auto_recipes",
"auto_technical",
],
# no parallelization to avoid conflict with environment variables
"parallel": 1,
5 changes: 3 additions & 2 deletions _doc/index.rst
@@ -39,6 +39,7 @@ It also implements tools to investigate, validate exported models (ExportedProgr
cmds/index
auto_examples/index
auto_recipes/index
auto_technical/index

.. toctree::
:maxdepth: 1
@@ -116,12 +117,12 @@ See :func:`onnx_diagnostic.helpers.string_type`.
onnx_dtype_name
+++++++++++++++

-See :func:`onnx_diagnostic.helpers.onnx_dtype_name`.
+See :func:`onnx_diagnostic.helpers.onnx_helper.onnx_dtype_name`.

.. code-block:: python

import onnx
-    from onnx_diagnostic.helpers import onnx_dtype_name
+    from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name

itype = onnx.TensorProto.BFLOAT16
print(onnx_dtype_name(itype))
2 changes: 2 additions & 0 deletions _doc/technical/README.txt
@@ -0,0 +1,2 @@
Technical Details
=================
201 changes: 201 additions & 0 deletions _doc/technical/plot_parallelized_reduction.py
@@ -0,0 +1,201 @@
"""
Reproducible Parallelized Reduction is difficult
================================================

A reduction is a frequent operation in neural networks. It appears in layer
normalization, softmax... Because of the finite float precision, the result of the
computation changes with the order of the elements. The following examples show this
variation under different hypotheses on the vector distribution; a small sketch right
after the imports illustrates why the order matters.
We consider a vector :math:`X = (x_1, ..., x_n)` and compute its average:

.. math::

    mean(X) = \\frac{\\sum_{i=1}^n x_i}{n}

Or the normalization of the vector:

.. math::

    norm(X)_i = \\frac{x_i - \\mathbb{E}X}{\\sqrt{\\mathbb{V}X}}

with :math:`\\mathbb{E}X = mean(X)` and
:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\right)^2\\right)`.
We draw 128 random permutations of :math:`X`. The mean should not change and the
normalized vector should contain the same values. For the mean, we report the
difference between the highest and the lowest values obtained over all permutations.
For the normalization, we report the maximum difference between the original
normalized vector and the permuted one, both sorted.

The computation code
++++++++++++++++++++
"""

import itertools
from tqdm import tqdm
import numpy as np
import pandas

DATA = []
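
# %%
# Why the order matters
# +++++++++++++++++++++
#
# A minimal sketch, independent of the experiment below: float addition is not
# associative, so two different summation orders can produce two different results.

a, b, c = np.float32(1e8), np.float32(-1e8), np.float32(1e-3)
print((a + b) + c)  # 0.001: the exact cancellation happens first
print(a + (b + c))  # 0.0: 1e-3 is absorbed by -1e8 before the cancellation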


def str_dtype(dtype):
"""Displays numpy dtype in a nicer way."""
if dtype == np.float64:
return "fp64"
if dtype == np.float32:
return "fp32"
if dtype == np.float16:
return "fp16"
raise ValueError(f"Unexpected value {dtype}")


def layer_norm(a, eps=1e-6):
    """
    Normalizes the vector ``a``.
    The computation is done in float32 or float64, never in float16.
    """
    # float16 inputs are upcast to float32 before computing the statistics
    ctype = np.float32 if a.dtype == np.float16 else a.dtype
    a32 = a.astype(ctype)
    m = a32.mean(axis=-1, keepdims=True)
    c = a32 - m
    # standard deviation, with eps added after the square root in this version
    va = np.sqrt((c * c).mean(axis=-1, keepdims=True))
    va += eps
    return (c / va).astype(a.dtype)


def compute(values, fct):
    """
    Compares the results of function ``fct`` on permutations of a sample.
    Loops over multiple sizes and dtypes, drawing 128 permutations each time.
    """

    def make_value(base, value):
        # For a vector output, compares the sorted values; for a scalar output
        # (the mean), keeps the value itself.
        if value.size > 1:
            return np.abs(np.sort(base) - np.sort(value)).max()
        return value

sizes = [2, 4, 8, 16, 512, 1024, 2048, 4096, 8192]
dtypes = [np.float64, np.float32, np.float16]
N = list(range(128))
exps = list(itertools.product(sizes, dtypes, N))
data = []
ech = None
for size, dtype, n in tqdm(exps):
if n == 0:
ech = values[:size].astype(dtype)
base = fct(ech)
assert base.dtype == ech.dtype
obs = dict(
n=n, size=size, dtype=str_dtype(ech.dtype), value=make_value(base, fct(ech))
)
data.append(obs)

        if n == 1:
            new_ech = np.sort(ech)  # ascending order
        elif n == 2:
            new_ech = np.sort(ech)[::-1]  # descending order
        else:
            new_ech = np.random.permutation(ech)  # random order
assert new_ech.dtype == ech.dtype
assert new_ech.shape == ech.shape
obs = dict(
n=n + 1,
size=size,
dtype=str_dtype(new_ech.dtype),
value=make_value(base, fct(new_ech)),
)
data.append(obs)

    # Aggregates the draws: for every (dtype, size), the spread between the
    # highest and the lowest observed values measures the non-reproducibility.
    df = pandas.DataFrame(data)
    agg = df.drop("n", axis=1).groupby(["dtype", "size"], as_index=False).agg(["min", "max"])
    agg["value", "delta"] = agg["value", "max"] - agg["value", "min"]
    piv = agg.pivot(index="size", columns="dtype", values=("value", "delta"))
    return piv


# %%
# Normal Distribution
# +++++++++++++++++++
#
# Let's see what it returns on a random sample drawn from a normal distribution.
# First, the average.

values = np.random.randn(4096)
mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
mean["name"] = "normal"
print(mean)

# %%
# Then the layer normalization.

ln = compute(values, layer_norm)
ln["name"] = "normal"
DATA.append(ln.reset_index(drop=True).max(axis=0))
print(ln)

# %%
# Fixed values
# ++++++++++++
#
# We try a fixed vector where every 128th value is very high and all the others
# are small.

values[:] = -1e-4
values[::128] = 100
mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
mean["name"] = "fixed"
print(mean)

# %%
# And the normalized vector.
ln = compute(values, layer_norm)
ln["name"] = "fixed"
DATA.append(ln.reset_index(drop=True).max(axis=0))
print(ln)

# %%
# Pareto Distribution
# +++++++++++++++++++
#
# A heavy-tailed distribution.

values = np.random.pareto(1, (4096,))
print(values)

mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
mean["name"] = "normal"
print(mean)

# %%
# And the normalized vector.
ln = compute(values, layer_norm)
ln["name"] = "pareto"
DATA.append(ln.reset_index(drop=True).max(axis=0))
print(ln)

# %%
# Summary
# +++++++
#
# We consider the maximum difference obtained across all sample sizes.

df = pandas.DataFrame(DATA).set_index("name")
print(df)

# %%
# Visually.

ax = df.plot.bar(logy=True)
fig = ax.get_figure()
fig.savefig("plot_parallelized_reduction.png")

# %%
# In a deep neural network
# ++++++++++++++++++++++++
#
# Some tensors hold about 500 million values, for instance a shape of
# 16x32x1024x1024. A layer normalization over the last dimension then performs
# 16x32x1024, roughly half a million reductions, repeated over 20 layers.
# When a deep neural network is computed with a different code
# doing a different parallelization (GPU/CPU for example),
# the order of the reductions may change; therefore,
# small errors appear and propagate. The sketch below shows the effect with
# two summation strategies.
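
# %%
# A minimal sketch of that effect (the chunking is hypothetical, not how any
# particular library parallelizes): summing the same float32 vector sequentially
# and in 8 chunks, as a parallel reduction would, typically gives slightly
# different results.

x = np.random.randn(2**16).astype(np.float32)
seq = np.float32(0)
for v in x:
    seq += v  # sequential accumulation, one element at a time
par = np.float32(x.reshape(8, -1).sum(axis=1).sum())  # chunked, "parallel" order
print(seq, par, abs(seq - par))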
4 changes: 2 additions & 2 deletions _unittests/ut_helpers/test_bench_run.py
@@ -105,14 +105,14 @@ def test_make_configs_replace(self):
def test_max_diff(self):
self.assertEqual(
max_diff(torch.Tensor([1, 2]), torch.Tensor([1, 2])),
{"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0},
{"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)},
)
self.assertEqual(
max_diff(
(torch.Tensor([1, 2]),),
(torch.Tensor([1, 2])),
),
{"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0},
{"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)},
)
self.assertEqual(
max_diff(
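Note: these updated assertions show that ``max_diff`` now also returns an ``argm``
entry, presumably the index of the element where the maximum absolute difference
occurs. A minimal sketch of how such an index can be computed (illustrative only,
not the library's internals):

    import numpy as np

    def max_abs_diff_with_index(a, b):
        # Locate the largest absolute difference between two arrays and return
        # both its value and its index as a tuple, like the "argm" entry above.
        d = np.abs(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
        idx = np.unravel_index(int(np.argmax(d)), d.shape)
        return float(d[idx]), idx

    print(max_abs_diff_with_index([1, 2], [1, 2]))  # prints (0.0, (0,))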
2 changes: 1 addition & 1 deletion _unittests/ut_helpers/test_helper.py
@@ -245,7 +245,7 @@ def test_max_diff_hist_array_string_diff(self):
diff = max_diff(x, y, hist=True)
s = string_diff(diff)
self.assertEndsWith(
"/#8>0.0-#8>0.0001-#6>0.001-#5>0.01-#5>0.1-#3>1.0-#2>10.0-#1>100.0", s
"/#8>0.0-#8>0.0001-#6>0.001-#5>0.01-#5>0.1-#3>1.0-#2>10.0-#1>100.0,amax=2,1", s
)

def test_max_diff_hist_tensor(self):
97 changes: 97 additions & 0 deletions _unittests/ut_xrun_doc/test_documentation_technical.py
@@ -0,0 +1,97 @@
import unittest
import os
import sys
import importlib.util
import subprocess
import time
from onnx_diagnostic import __file__ as onnx_diagnostic_file
from onnx_diagnostic.ext_test_case import ExtTestCase, is_windows, ignore_errors


VERBOSE = 0
ROOT = os.path.realpath(os.path.abspath(os.path.join(onnx_diagnostic_file, "..", "..")))


def import_source(module_file_path, module_name):
if not os.path.exists(module_file_path):
raise FileNotFoundError(module_file_path)
module_spec = importlib.util.spec_from_file_location(module_name, module_file_path)
if module_spec is None:
raise FileNotFoundError(
"Unable to find '{}' in '{}'.".format(module_name, module_file_path)
)
module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)
    return module


class TestDocumentationTechnical(ExtTestCase):
def run_test(self, fold: str, name: str, verbose=0) -> int:
ppath = os.environ.get("PYTHONPATH", "")
if not ppath:
os.environ["PYTHONPATH"] = ROOT
elif ROOT not in ppath:
sep = ";" if is_windows() else ":"
os.environ["PYTHONPATH"] = ppath + sep + ROOT
perf = time.perf_counter()
try:
            mod = import_source(os.path.join(fold, name), os.path.splitext(name)[0])
assert mod is not None
except FileNotFoundError:
# try another way
cmds = [sys.executable, "-u", os.path.join(fold, name)]
p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
res = p.communicate()
out, err = res
st = err.decode("ascii", errors="ignore")
if st and "Traceback" in st:
if '"dot" not found in path.' in st:
# dot not installed, this part
# is tested in onnx framework
raise unittest.SkipTest(f"failed: {name!r} due to missing dot.")
if (
"We couldn't connect to 'https://huggingface.co'" in st
or "Cannot access content at: https://huggingface.co/" in st
):
raise unittest.SkipTest(f"Connectivity issues due to\n{err}")
raise AssertionError( # noqa: B904
"Example '{}' (cmd: {} - exec_prefix='{}') "
"failed due to\n{}"
"".format(name, cmds, sys.exec_prefix, st)
)
dt = time.perf_counter() - perf
if verbose:
print(f"{dt:.3f}: run {name!r}")
return 1

@classmethod
def add_test_methods(cls):
this = os.path.abspath(os.path.dirname(__file__))
fold = os.path.normpath(os.path.join(this, "..", "..", "_doc", "technical"))
found = os.listdir(fold)
for name in found:
if not name.endswith(".py") or not name.startswith("plot_"):
continue
reason = None

if reason:

@unittest.skip(reason)
def _test_(self, name=name):
res = self.run_test(fold, name, verbose=VERBOSE)
self.assertTrue(res)

else:

@ignore_errors(OSError) # connectivity issues
def _test_(self, name=name):
res = self.run_test(fold, name, verbose=VERBOSE)
self.assertTrue(res)

short_name = os.path.split(os.path.splitext(name)[0])[-1]
setattr(cls, f"test_{short_name}", _test_)


TestDocumentationTechnical.add_test_methods()

if __name__ == "__main__":
unittest.main(verbosity=2)
6 changes: 5 additions & 1 deletion _unittests/ut_xrun_doc/test_unit_test.py
@@ -52,7 +52,11 @@ def test_statistics_on_folders(self):

df = pandas.DataFrame(stat)
gr = df.drop("name", axis=1).groupby(["ext", "dir"]).sum().reset_index()
gr = gr[(gr["dir"] != "_doc/auto_examples") & (gr["dir"] != "_doc/auto_recipes")]
gr = gr[
(gr["dir"] != "_doc/auto_examples")
& (gr["dir"] != "_doc/auto_recipes")
& (gr["dir"] != "_doc/auto_technical")
]
total = (
gr[gr["dir"].str.contains("onnx_diagnostic/")]
.drop(["ext", "dir"], axis=1)