From 95881048dd9e72799b1a4dc6996b80295e65c3e8 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 11:21:16 +0200 Subject: [PATCH 1/9] add argmax --- onnx_diagnostic/helpers/helper.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 51a1c054..784fc89f 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -1181,6 +1181,7 @@ def max_diff( if exp_cpu.size == got_cpu.size else (np.inf, np.inf, np.inf, 0, np.inf) ) + argm = None else: abs_diff, rel_diff, sum_diff, n_diff, nan_diff = ( float(diff.max()), @@ -1189,6 +1190,7 @@ def max_diff( float(diff.size), float(ndiff.sum()), ) + argm = tuple(map(int, np.unravel_index(diff.argmax(), diff.shape))) if verbose >= 10 and (abs_diff >= 10 or rel_diff >= 10): # To understand the value it comes from. if debug_info: @@ -1219,7 +1221,9 @@ def max_diff( f"_index={_index}" ) - res = dict(abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff) + res = dict( + abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm + ) if hist: if isinstance(hist, bool): hist = np.array([0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100], dtype=diff.dtype) @@ -1284,8 +1288,10 @@ def max_diff( float(diff.numel()), float(ndiff.sum()), ) + argm = tuple(map(int, torch.unravel_index(diff.argmax(), diff.shape))) elif got_cpu.numel() == exp_cpu.numel(): abs_diff, rel_diff, sum_diff, n_diff, nan_diff = (0.0, 0.0, 0.0, 0.0, 0.0) + argm = None else: abs_diff, rel_diff, sum_diff, n_diff, nan_diff = ( np.inf, @@ -1294,6 +1300,7 @@ def max_diff( np.inf, np.inf, ) + argm = None if verbose >= 10 and (abs_diff >= 10 or rel_diff >= 10): # To understand the value it comes from. 
@@ -1325,7 +1332,9 @@ def max_diff( f"_index={_index}" ) - res = dict(abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff) + res = dict( + abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm + ) if hist: if isinstance(hist, bool): hist = torch.tensor( @@ -1478,6 +1487,8 @@ def string_diff(diff: Dict[str, Any]) -> str: rows.append(f"#{v}{k}") suffix = "-".join(rows) suffix = f"/{suffix}" + if "argm" in diff: + suffix += f", argmax={diff['argm']}" if diff.get("dnan", None): if diff["abs"] == 0 or diff["rel"] == 0: return f"abs={diff['abs']}, rel={diff['rel']}, dnan={diff['dnan']}{suffix}" From c4bfe8c7f063b56151682839d5a7b491e69ad4fb Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 12:04:05 +0200 Subject: [PATCH 2/9] ut --- _unittests/ut_helpers/test_bench_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_unittests/ut_helpers/test_bench_run.py b/_unittests/ut_helpers/test_bench_run.py index c0aaa98f..ac9579e3 100644 --- a/_unittests/ut_helpers/test_bench_run.py +++ b/_unittests/ut_helpers/test_bench_run.py @@ -105,14 +105,14 @@ def test_make_configs_replace(self): def test_max_diff(self): self.assertEqual( max_diff(torch.Tensor([1, 2]), torch.Tensor([1, 2])), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)}, ) self.assertEqual( max_diff( (torch.Tensor([1, 2]),), (torch.Tensor([1, 2])), ), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)}, ) self.assertEqual( max_diff( From da83cea66890185fb697dcec9ce779232bf5cfe5 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 14:06:28 +0200 Subject: [PATCH 3/9] fix git --- _unittests/ut_helpers/test_helper.py | 2 +- onnx_diagnostic/helpers/helper.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/_unittests/ut_helpers/test_helper.py b/_unittests/ut_helpers/test_helper.py index faa09fc6..b42ed072 100644 --- a/_unittests/ut_helpers/test_helper.py +++ b/_unittests/ut_helpers/test_helper.py @@ -245,7 +245,7 @@ def test_max_diff_hist_array_string_diff(self): diff = max_diff(x, y, hist=True) s = string_diff(diff) self.assertEndsWith( - "/#8>0.0-#8>0.0001-#6>0.001-#5>0.01-#5>0.1-#3>1.0-#2>10.0-#1>100.0", s + "/#8>0.0-#8>0.0001-#6>0.001-#5>0.01-#5>0.1-#3>1.0-#2>10.0-#1>100.0,amax=2,1", s ) def test_max_diff_hist_tensor(self): diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 784fc89f..47e8f702 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -2,7 +2,7 @@ import enum import inspect from dataclasses import is_dataclass, fields -from typing import Any, Callable, Dict, List, Optional, Set, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import numpy as np @@ -872,7 +872,7 @@ def max_diff( _index: int = 0, allow_unique_tensor_with_list_of_one_element: bool = True, hist: Optional[Union[bool, List[float]]] = None, -) -> Dict[str, float]: +) -> Dict[str, Union[float, str, int, Tuple[int, ...]]]: """ Returns the maximum discrepancy. 
@@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res = dict( + res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res = dict( + res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1488,7 +1488,12 @@ def string_diff(diff: Dict[str, Any]) -> str: suffix = "-".join(rows) suffix = f"/{suffix}" if "argm" in diff: - suffix += f", argmax={diff['argm']}" + sa = ( + ",".join(map(str, diff["argm"])) + if isinstance(diff["argm"], tuple) + else str(diff["argm"]) + ) + suffix += f",amax={sa}" if diff.get("dnan", None): if diff["abs"] == 0 or diff["rel"] == 0: return f"abs={diff['abs']}, rel={diff['rel']}, dnan={diff['dnan']}{suffix}" From 83a272cde972c465b4221987cf9e88a478cca37d Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 15:20:18 +0200 Subject: [PATCH 4/9] fix issues --- onnx_diagnostic/helpers/helper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 47e8f702..23a8154f 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -872,7 +872,7 @@ def max_diff( _index: int = 0, allow_unique_tensor_with_list_of_one_element: bool = True, hist: Optional[Union[bool, List[float]]] = None, -) -> Dict[str, Union[float, str, int, Tuple[int, ...]]]: +) -> Dict[str, Union[float, int, Tuple[int, ...]]]: """ Returns the maximum discrepancy. @@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( + res: Dict[str, Union[int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[str, int, float, Tuple[int, ...]]] = dict( + res: Dict[str, Union[int, float, Tuple[int, ...]]] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: From 0e5e11de0e891713deaaadd62fa334072e63b593 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 16:11:21 +0200 Subject: [PATCH 5/9] mypy --- onnx_diagnostic/ext_test_case.py | 4 +++- onnx_diagnostic/helpers/helper.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/onnx_diagnostic/ext_test_case.py b/onnx_diagnostic/ext_test_case.py index 9662dc74..c7c55c61 100644 --- a/onnx_diagnostic/ext_test_case.py +++ b/onnx_diagnostic/ext_test_case.py @@ -1195,7 +1195,9 @@ def assert_onnx_disc( if verbose: print(f"[{vname}] diff {string_diff(diff)}") assert ( - not numpy.isnan(diff["abs"]) + isinstance(diff["abs"], float) + and isinstance(diff["rel"], float) + and not numpy.isnan(diff["abs"]) and diff["abs"] <= atol and not numpy.isnan(diff["rel"]) and diff["rel"] <= rtol diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 23a8154f..818a3ed0 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[int, float, Tuple[int, ...]]] = dict( + res: Dict[str, float] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, Union[int, float, Tuple[int, ...]]] = 
dict( + res: Dict[str, float] = dict( abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: From a41b3d1002516debdb63e85d94f6ca261c785aaf Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 17:53:12 +0200 Subject: [PATCH 6/9] add one gallery --- .github/workflows/check-release.yml | 1 + .gitignore | 1 + _doc/conf.py | 2 + _doc/index.rst | 1 + _doc/technical/README.txt | 2 + _doc/technical/plot_parallelized_reduction.py | 198 ++++++++++++++++++ .../test_documentation_technical.py | 97 +++++++++ _unittests/ut_xrun_doc/test_unit_test.py | 6 +- clean_onnx.sh | 19 ++ onnx_diagnostic/export/validate.py | 5 +- onnx_diagnostic/helpers/helper.py | 14 +- pyproject.toml | 2 + 12 files changed, 339 insertions(+), 9 deletions(-) create mode 100644 _doc/technical/README.txt create mode 100644 _doc/technical/plot_parallelized_reduction.py create mode 100644 _unittests/ut_xrun_doc/test_documentation_technical.py diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml index a179377b..8bf5e057 100644 --- a/.github/workflows/check-release.yml +++ b/.github/workflows/check-release.yml @@ -74,6 +74,7 @@ jobs: echo _unittests/ >> .git/info/sparse-checkout echo _doc/examples/ >> .git/info/sparse-checkout echo _doc/recipes/ >> .git/info/sparse-checkout + echo _doc/technical/ >> .git/info/sparse-checkout echo pyproject.toml >> .git/info/sparse-checkout echo requirements-dev.txt >> .git/info/sparse-checkout git pull origin main diff --git a/.gitignore b/.gitignore index ce041d2e..2b716f95 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ prof plot_*.txt _doc/auto_examples/* _doc/auto_recipes/* +_doc/auto_technical/* _doc/sg_execution_times.rst _doc/examples/_cache/* _doc/examples/dump_models/* diff --git a/_doc/conf.py b/_doc/conf.py index 77daebca..7936a4fd 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -156,11 +156,13 @@ def linkcode_resolve(domain, info): "examples_dirs": [ os.path.join(os.path.dirname(__file__), "examples"), os.path.join(os.path.dirname(__file__), "recipes"), + os.path.join(os.path.dirname(__file__), "technical"), ], # path where to save gallery generated examples "gallery_dirs": [ "auto_examples", "auto_recipes", + "auto_technical", ], # no parallelization to avoid conflict with environment variables "parallel": 1, diff --git a/_doc/index.rst b/_doc/index.rst index f2f374bf..43321cdc 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -39,6 +39,7 @@ It also implements tools to investigate, validate exported models (ExportedProgr cmds/index auto_examples/index auto_recipes/index + auto_technical/index .. toctree:: :maxdepth: 1 diff --git a/_doc/technical/README.txt b/_doc/technical/README.txt new file mode 100644 index 00000000..d9b5a645 --- /dev/null +++ b/_doc/technical/README.txt @@ -0,0 +1,2 @@ +Technical Details +================= diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py new file mode 100644 index 00000000..79a0b8d7 --- /dev/null +++ b/_doc/technical/plot_parallelized_reduction.py @@ -0,0 +1,198 @@ +""" +Reproducible Parallelized Reduction is difficult +================================================ + +A reduction is a frequent operation in neural network. It appears in layer normalization, +softmax. Because of the float precision, the result of the computation +changes based on the order of the elements. The following examples show the variation +based on different hypothesis on the vector distribution. 
+We consider a vector :math:`X = (x_1, ..., x_n)`.
+We compute its average:
+
+.. math::
+
+    mean(X) = \\frac{\\sum_{i=1}^n x_i}{n}
+
+Or the normalization of the vector:
+
+.. math::
+
+    norm(X)_i = \\frac{ X_i - \\mathbb{E}X}{ \\sqrt{ \\mathbb{V}X}}
+
+We draw 128 random permutation of X. The average or mean should not change.
+And the normalized vector should have the same value. In the first case, we compute
+the difference between the highest and the lowest values obtained for the average.
+In the second case, we look for the maximum difference between the original normalized
+vector and the permuted one (both sorted).
+
+The computation code
+++++++++++++++++++++
+"""
+
+import itertools
+from tqdm import tqdm
+import numpy as np
+import pandas
+
+DATA = []
+
+
+def str_dtype(dtype):
+    """Displays numpy dtype in a nicer way."""
+    if dtype == np.float64:
+        return "fp64"
+    if dtype == np.float32:
+        return "fp32"
+    if dtype == np.float16:
+        return "fp16"
+    raise ValueError(f"Unexpected value {dtype}")
+
+
+def layer_norm(a, eps=1e-6):
+    """
+    Normalizes the vector a.
+    The computation is done in float32 or float64.
+    """
+    ctype = np.float32 if a.dtype == np.float16 else a.dtype
+    a32 = a.astype(ctype)
+    m = a32.mean(axis=-1, keepdims=True)
+    c = a32 - m
+    va = np.sqrt((c * c).mean(axis=-1, keepdims=True))
+    va += eps
+    return (c / va).astype(a.dtype)
+
+
+def compute(values, fct):
+    """
+    Compare the results of function ``fct`` on a sample.
+    Loops over multiple sizes, dtypes. Tries 128 times.
+    """
+
+    def make_value(base, value):
+        if value.size > 1:
+            return np.abs(np.sort(base) - np.sort(value)).max()
+        return value
+
+    sizes = [2, 4, 8, 16, 512, 1024, 2048, 4096, 8192]
+    dtypes = [np.float64, np.float32, np.float16]
+    N = list(range(128))
+    exps = list(itertools.product(sizes, dtypes, N))
+    data = []
+    ech = None
+    for size, dtype, n in tqdm(exps):
+        if n == 0:
+            ech = values[:size].astype(dtype)
+            base = fct(ech)
+            assert base.dtype == ech.dtype
+        obs = dict(
+            n=n, size=size, dtype=str_dtype(ech.dtype), value=make_value(base, fct(ech))
+        )
+        data.append(obs)
+
+        if n == 1:
+            new_ech = np.sort(ech)
+        elif n == 2:
+            new_ech = np.sort(ech)[::-1]
+        else:
+            new_ech = np.random.permutation(ech)
+        assert new_ech.dtype == ech.dtype
+        assert new_ech.shape == ech.shape
+        obs = dict(
+            n=n + 1,
+            size=size,
+            dtype=str_dtype(new_ech.dtype),
+            value=make_value(base, fct(new_ech)),
+        )
+        data.append(obs)
+
+    df = pandas.DataFrame(data)
+    agg = df.drop("n", axis=1).groupby(["dtype", "size"], as_index=False).agg(["min", "max"])
+    agg["value", "delta"] = agg["value", "max"] - agg["value", "min"]
+    piv = agg.pivot(index="size", columns="dtype", values=("value", "delta"))
+    return piv
+
+
+# %%
+# Normal Law
+# ++++++++++
+#
+# Let's see what it returns on a random sample following a normal law.
+# First the average.
+
+values = np.random.randn(4096)
+mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
+mean["name"] = "normal"
+print(mean)
+
+# %%
+# Then the layer normalization.
+
+ln = compute(values, layer_norm)
+ln["name"] = "normal"
+DATA.append(ln.reset_index(drop=True).max(axis=0))
+print(ln)
+
+# %%
+# Fixed values
+# ++++++++++++
+#
+# We try a fixed vector with one very high value and all the others are small.
+
+values[:] = -1e-4
+values[::128] = 100
+mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
+mean["name"] = "fixed"
+print(mean)
+
+
+ln = compute(values, layer_norm)
+ln["name"] = "fixed"
+DATA.append(ln.reset_index(drop=True).max(axis=0))
+print(ln)
+
+# %%
+# Pareto Distribution
+# +++++++++++++++++++
+#
+# A law with a long tail.
+
+values = np.random.pareto(1, (4096,))
+print(values)
+
+mean = compute(values, lambda x: np.mean(x).astype(x.dtype))
+mean["name"] = "pareto"
+print(mean)
+
+
+ln = compute(values, layer_norm)
+ln["name"] = "pareto"
+DATA.append(ln.reset_index(drop=True).max(axis=0))
+print(ln)
+
+# %%
+# Summary
+# +++++++
+#
+# We consider the maximum difference obtained for any sample size.
+
+print(DATA)
+df = pandas.DataFrame(DATA).set_index("name")
+print(df)
+
+# %%
+# Visually.
+
+ax = df.plot.bar(logy=True)
+fig = ax.get_figure()
+fig.savefig("plot_parallelized_reduction.png")
+
+# %%
+# In a deep neural network
+# ++++++++++++++++++++++++
+#
+# Some of the vectors have 500M values (16x32x1024x1024). A layer normalization
+# does 16x32x1024 ~ 0.5M reductions, over 20 layers.
+# When a deep neural network is computed with a difference code,
+# doing a different parallelization (GPU/CPU for example),
+# the order of the reduction may change and therefore,
+# some errors will appear and propagate.
diff --git a/_unittests/ut_xrun_doc/test_documentation_technical.py b/_unittests/ut_xrun_doc/test_documentation_technical.py
new file mode 100644
index 00000000..7ed73e87
--- /dev/null
+++ b/_unittests/ut_xrun_doc/test_documentation_technical.py
@@ -0,0 +1,97 @@
+import unittest
+import os
+import sys
+import importlib.util
+import subprocess
+import time
+from onnx_diagnostic import __file__ as onnx_diagnostic_file
+from onnx_diagnostic.ext_test_case import ExtTestCase, is_windows, ignore_errors
+
+
+VERBOSE = 0
+ROOT = os.path.realpath(os.path.abspath(os.path.join(onnx_diagnostic_file, "..", "..")))
+
+
+def import_source(module_file_path, module_name):
+    if not os.path.exists(module_file_path):
+        raise FileNotFoundError(module_file_path)
+    module_spec = importlib.util.spec_from_file_location(module_name, module_file_path)
+    if module_spec is None:
+        raise FileNotFoundError(
+            "Unable to find '{}' in '{}'.".format(module_name, module_file_path)
+        )
+    module = importlib.util.module_from_spec(module_spec)
+    return module_spec.loader.exec_module(module)
+
+
+class TestDocumentationTechnical(ExtTestCase):
+    def run_test(self, fold: str, name: str, verbose=0) -> int:
+        ppath = os.environ.get("PYTHONPATH", "")
+        if not ppath:
+            os.environ["PYTHONPATH"] = ROOT
+        elif ROOT not in ppath:
+            sep = ";" if is_windows() else ":"
+            os.environ["PYTHONPATH"] = ppath + sep + ROOT
+        perf = time.perf_counter()
+        try:
+            mod = import_source(fold, os.path.splitext(name)[0])
+            assert mod is not None
+        except FileNotFoundError:
+            # try another way
+            cmds = [sys.executable, "-u", os.path.join(fold, name)]
+            p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            res = p.communicate()
+            out, err = res
+            st = err.decode("ascii", errors="ignore")
+            if st and "Traceback" in st:
+                if '"dot" not found in path.'
in st: + # dot not installed, this part + # is tested in onnx framework + raise unittest.SkipTest(f"failed: {name!r} due to missing dot.") + if ( + "We couldn't connect to 'https://huggingface.co'" in st + or "Cannot access content at: https://huggingface.co/" in st + ): + raise unittest.SkipTest(f"Connectivity issues due to\n{err}") + raise AssertionError( # noqa: B904 + "Example '{}' (cmd: {} - exec_prefix='{}') " + "failed due to\n{}" + "".format(name, cmds, sys.exec_prefix, st) + ) + dt = time.perf_counter() - perf + if verbose: + print(f"{dt:.3f}: run {name!r}") + return 1 + + @classmethod + def add_test_methods(cls): + this = os.path.abspath(os.path.dirname(__file__)) + fold = os.path.normpath(os.path.join(this, "..", "..", "_doc", "technical")) + found = os.listdir(fold) + for name in found: + if not name.endswith(".py") or not name.startswith("plot_"): + continue + reason = None + + if reason: + + @unittest.skip(reason) + def _test_(self, name=name): + res = self.run_test(fold, name, verbose=VERBOSE) + self.assertTrue(res) + + else: + + @ignore_errors(OSError) # connectivity issues + def _test_(self, name=name): + res = self.run_test(fold, name, verbose=VERBOSE) + self.assertTrue(res) + + short_name = os.path.split(os.path.splitext(name)[0])[-1] + setattr(cls, f"test_{short_name}", _test_) + + +TestDocumentationTechnical.add_test_methods() + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_xrun_doc/test_unit_test.py b/_unittests/ut_xrun_doc/test_unit_test.py index f8e91da3..ca2dfa08 100644 --- a/_unittests/ut_xrun_doc/test_unit_test.py +++ b/_unittests/ut_xrun_doc/test_unit_test.py @@ -52,7 +52,11 @@ def test_statistics_on_folders(self): df = pandas.DataFrame(stat) gr = df.drop("name", axis=1).groupby(["ext", "dir"]).sum().reset_index() - gr = gr[(gr["dir"] != "_doc/auto_examples") & (gr["dir"] != "_doc/auto_recipes")] + gr = gr[ + (gr["dir"] != "_doc/auto_examples") + & (gr["dir"] != "_doc/auto_recipes") + & (gr["dir"] != "_doc/auto_technical") + ] total = ( gr[gr["dir"].str.contains("onnx_diagnostic/")] .drop(["ext", "dir"], axis=1) diff --git a/clean_onnx.sh b/clean_onnx.sh index d2a655d6..8716d179 100644 --- a/clean_onnx.sh +++ b/clean_onnx.sh @@ -65,6 +65,25 @@ rm _doc/recipes/*.script.onnx rm _doc/recipes/dump_models -rf rm _doc/recipes/dump_onx_* +rm _doc/technical/plot*.onnx +rm _doc/technical/plot*.onnx.weight +rm _doc/technical/plot*.onnx.data +rm _doc/technical/plot*.txt +rm _doc/technical/ort*.onnx +rm _doc/technical/*.sarif +rm _doc/technical/*.json +rm _doc/technical/*.png +rm _doc/technical/*.csv +rm _doc/technical/*.pte +rm _doc/technical/*.xlsx +rm _doc/technical/dummy*.onnx +rm _doc/technical/evaluation*-script.onnx +rm _doc/technical/*.opt.onnx +rm _doc/technical/*.dynamo.onnx +rm _doc/technical/*.script.onnx +rm _doc/technical/dump_models -rf +rm _doc/technical/dump_onx_* + rm _tools/bin -rf rm _tools/mambaroot -rf rm _tools/repos -rf diff --git a/onnx_diagnostic/export/validate.py b/onnx_diagnostic/export/validate.py index 8e7a000f..bf919597 100644 --- a/onnx_diagnostic/export/validate.py +++ b/onnx_diagnostic/export/validate.py @@ -96,7 +96,10 @@ def _get(a): ) print(f"[compare_modules] discrepancies={string_diff(diff)}") assert not exc or ( - diff["abs"] <= atol and diff["rel"] <= rtol + isinstance(diff["abs"], float) + and isinstance(diff["rel"], float) + and diff["abs"] <= atol + and diff["rel"] <= rtol ), f"Discrepancies={string_diff(diff)} higher than expected." 
return dict(args=args, kwargs=kwargs, expected=expected, got=got, diff=diff) return dict(args=args, kwargs=kwargs, got=got) diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 818a3ed0..a02c7ede 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -1080,8 +1080,8 @@ def max_diff( am = max(am, d["abs"]) dn = max(dn, d["dnan"]) rm = max(rm, d["rel"]) - sm += d["sum"] - n += d["n"] + sm += d["sum"] # type: ignore + n += d["n"] # type: ignore if "rep" in d: if drep is None: drep = d["rep"].copy() @@ -1091,7 +1091,7 @@ def max_diff( res = dict(abs=am, rel=rm, sum=sm, n=n, dnan=dn) if drep: res["rep"] = drep - return res + return res # type: ignore if isinstance(expected, dict): if verbose >= 6: @@ -1221,7 +1221,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, float] = dict( + res: Dict[str, float] = dict( # type: ignore abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1232,7 +1232,7 @@ def max_diff( res["rep"] = dict( zip([f">{x}" for x in hist], [int(i) for i in (cou.sum() - np.cumsum(cou))]) ) - return res + return res # type: ignore if isinstance(expected, torch.Tensor) and isinstance(got, torch.Tensor): if verbose >= 6: @@ -1332,7 +1332,7 @@ def max_diff( f"_index={_index}" ) - res: Dict[str, float] = dict( + res: Dict[str, float] = dict( # type: ignore abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) if hist: @@ -1349,7 +1349,7 @@ def max_diff( [int(i) for i in (cou.sum() - torch.cumsum(cou, 0))], ) ) - return res + return res # type: ignore if "SquashedNormal" in expected.__class__.__name__: if verbose >= 6: diff --git a/pyproject.toml b/pyproject.toml index 79fdfd4c..3c706861 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ packages = ["onnx_diagnostic"] exclude = [ "^_doc/auto_examples", # skips examples in the documentation "^_doc/auto_recipes", # skips examples in the documentation + "^_doc/auto_technical", # skips examples in the documentation "^_doc/conf.py", "^_doc/examples", "^_unittests", # skips unit tests @@ -102,6 +103,7 @@ select = [ "_doc/examples/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] "_doc/notebooks/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] "_doc/recipes/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] +"_doc/technical/plot_*.py" = ["E402", "B018", "PIE808", "SIM105", "SIM117"] "_unittests/*/test*.py" = ["B008", "B904", "PIE808", "SIM117", "SIM105", "UP008"] "onnx_diagnostic/export/__init__.py" = ["F401"] "onnx_diagnostic/helpers/__init__.py" = ["F401"] From a2d4e4ea7e289d99e37a02576788b1326003eec1 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 18:07:53 +0200 Subject: [PATCH 7/9] docu --- _doc/index.rst | 4 ++-- onnx_diagnostic/helpers/onnx_helper.py | 2 +- onnx_diagnostic/helpers/torch_helper.py | 11 ++++++++--- onnx_diagnostic/torch_models/test_helper.py | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/_doc/index.rst b/_doc/index.rst index 43321cdc..bafeae27 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -117,12 +117,12 @@ See :func:`onnx_diagnostic.helpers.string_type`. onnx_dtype_name +++++++++++++++ -See :func:`onnx_diagnostic.helpers.onnx_dtype_name`. +See :func:`onnx_diagnostic.helpers.onnx_helper.onnx_dtype_name`. .. 
code-block:: python import onnx - from onnx_diagnostic.helpers import onnx_dtype_name + from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name itype = onnx.TensorProto.BFLOAT16 print(onnx_dtype_name(itype)) diff --git a/onnx_diagnostic/helpers/onnx_helper.py b/onnx_diagnostic/helpers/onnx_helper.py index ec9cab1d..18173592 100644 --- a/onnx_diagnostic/helpers/onnx_helper.py +++ b/onnx_diagnostic/helpers/onnx_helper.py @@ -834,7 +834,7 @@ def tensor_statistics(tensor: Union[np.ndarray, TensorProto]) -> Dict[str, Union import pprint import numpy as np - from onnx_diagnostic.helper.onnx_helper import tensor_statistics + from onnx_diagnostic.helpers.onnx_helper import tensor_statistics t = np.random.rand(40, 50).astype(np.float16) pprint.pprint(tensor_statistics(t)) diff --git a/onnx_diagnostic/helpers/torch_helper.py b/onnx_diagnostic/helpers/torch_helper.py index 96fdd60b..465c74b9 100644 --- a/onnx_diagnostic/helpers/torch_helper.py +++ b/onnx_diagnostic/helpers/torch_helper.py @@ -102,8 +102,6 @@ def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype": # noqa: F821 :param to: onnx dtype :return: torch dtype """ - import torch - if itype == onnx.TensorProto.FLOAT: return torch.float32 if itype == onnx.TensorProto.FLOAT16: @@ -314,6 +312,9 @@ def steal_forward( import torch from onnx_diagnostic.helpers.torch_helper import steal_forward + from onnx_diagnostic.helpers.mini_onnx_builder import ( + create_input_tensors_from_onnx_model, + ) class SubModel(torch.nn.Module): def forward(self, x): @@ -363,7 +364,11 @@ def forward(self, x, y): for idx, m in model.named_modules(): level = str(idx).split(".") ll = len(level) - _, start_line = inspect.getsourcelines(m.forward) + try: + _, start_line = inspect.getsourcelines(m.forward) + except OSError: + # The code is not available. + start_line = 0 name = f"{idx}-{m.__class__.__name__}-{start_line}" models.append((f"{' ' * ll}{name}", m)) model = models diff --git a/onnx_diagnostic/torch_models/test_helper.py b/onnx_diagnostic/torch_models/test_helper.py index b187de3d..a9425ae5 100644 --- a/onnx_diagnostic/torch_models/test_helper.py +++ b/onnx_diagnostic/torch_models/test_helper.py @@ -780,7 +780,7 @@ def call_torch_export_export( disc = max_diff(data["expected"], expected) for k, v in disc.items(): - summary[f"disc_exported_{k}"] = v + summary[f"disc_exported_{k}"] = str(v) if verbose: print("[validate_model] done (exported run)") print(f"[validate_model] exported discrepancies={string_diff(disc)}") From f713f02de992fea85ad41873103c67e33fcda203 Mon Sep 17 00:00:00 2001 From: xadupre Date: Wed, 14 May 2025 18:24:27 +0200 Subject: [PATCH 8/9] spell --- _doc/technical/plot_parallelized_reduction.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py index 79a0b8d7..3f6a576e 100644 --- a/_doc/technical/plot_parallelized_reduction.py +++ b/_doc/technical/plot_parallelized_reduction.py @@ -2,8 +2,8 @@ Reproducible Parallelized Reduction is difficult ================================================ -A reduction is a frequent operation in neural network. It appears in layer normalization, -softmax. Because of the float precision, the result of the computation +A reduction is a frequent operation with neural networks. It appears in layer normalization, +softmax... Because of the float precision, the result of the computation changes based on the order of the elements. 
The following examples show the variation
 based on different hypothesis on the vector distribution.
 
 We consider a vector :math:`X = (x_1, ..., x_n)`.
@@ -19,11 +19,13 @@ norm(X)_i = \\frac{ X_i - \\mathbb{E}X}{ \\sqrt{ \\mathbb{V}X}}
 
-We draw 128 random permutation of X. The average or mean should not change.
-And the normalized vector should have the same value. In the first case, we compute
+With :math:`\\mathbb{E}X = mean(X)`,
+:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\rigth)^2\\right)`.
+We draw 128 random permutations of X. The average or mean should not change.
+And the normalized vector should have the same values. In the first case, we compute
 the difference between the highest and the lowest values obtained for the average.
 In the second case, we look for the maximum difference between the original normalized
-vector and the permuted one (both sorted).
+vector and the permuted one, both sorted.
 
 The computation code
 ++++++++++++++++++++
@@ -144,7 +146,8 @@ def make_value(base, value):
 mean["name"] = "fixed"
 print(mean)
 
-
+# %%
+# And the normalized vector.
 ln = compute(values, layer_norm)
 ln["name"] = "fixed"
 DATA.append(ln.reset_index(drop=True).max(axis=0))
 print(ln)
@@ -163,7 +166,8 @@ def make_value(base, value):
 mean["name"] = "pareto"
 print(mean)
 
-
+# %%
+# And the normalized vector.
 ln = compute(values, layer_norm)
 ln["name"] = "pareto"
 DATA.append(ln.reset_index(drop=True).max(axis=0))
 print(ln)
@@ -175,7 +179,6 @@ def make_value(base, value):
 #
 # We consider the maximum difference obtained for any sample size.
 
-print(DATA)
 df = pandas.DataFrame(DATA).set_index("name")
 print(df)
@@ -192,7 +195,7 @@ def make_value(base, value):
 #
 # Some of the vectors have 500M values (16x32x1024x1024). A layer normalization
 # does 16x32x1024 ~ 0.5M reductions, over 20 layers.
-# When a deep neural network is computed with a difference code,
+# When a deep neural network is computed with a different code
 # doing a different parallelization (GPU/CPU for example),
 # the order of the reduction may change and therefore,
 # some errors will appear and propagate.

From fd15fd18af8fe7f644f23324d0686271d31baf47 Mon Sep 17 00:00:00 2001
From: xadupre
Date: Wed, 14 May 2025 18:27:46 +0200
Subject: [PATCH 9/9] spell

---
 _doc/technical/plot_parallelized_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py
index 3f6a576e..d69ff08e 100644
--- a/_doc/technical/plot_parallelized_reduction.py
+++ b/_doc/technical/plot_parallelized_reduction.py
@@ -20,7 +20,7 @@ norm(X)_i = \\frac{ X_i - \\mathbb{E}X}{ \\sqrt{ \\mathbb{V}X}}
 
 With :math:`\\mathbb{E}X = mean(X)`,
-:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\rigth)^2\\right)`.
+:math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\right)^2\\right)`.
 We draw 128 random permutations of X. The average or mean should not change.
 And the normalized vector should have the same values. In the first case, we compute
 the difference between the highest and the lowest values obtained for the average.
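
To make the new field concrete, here is a minimal usage sketch. It is not part of the
patches above: it assumes the import path ``onnx_diagnostic.helpers.helper`` (the module
patched in this series), and it passes ``hist=True`` as the updated unit test
``test_max_diff_hist_array_string_diff`` does.

.. code-block:: python

    # Illustrative only: shows the ``argm`` field added by this series.
    import numpy as np

    from onnx_diagnostic.helpers.helper import max_diff, string_diff

    x = np.arange(12, dtype=np.float32).reshape(3, 4)
    y = x.copy()
    y[2, 1] += 1.0  # a single element differs

    diff = max_diff(x, y, hist=True)
    # ``argm`` is the unraveled index of the largest absolute difference.
    print(diff["argm"])  # (2, 1)
    # ``string_diff`` appends the location to its summary, e.g. ",amax=2,1".
    print(string_diff(diff))

Note that the aggregating branch of ``max_diff`` (the path merging results over nested
tuples or dicts, see patch 6) still builds ``res = dict(abs=am, rel=rm, sum=sm, n=n,
dnan=dn)`` without ``argm``, so the location is only reported when two arrays or tensors
are compared directly.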