-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import onnx
 import torch
-from .helper import string_type, flatten_object
+from .helper import string_type, flatten_object, max_diff
+from .torch_helper import torch_deepcopy
 from .ort_session import InferenceSessionForTorch
 
 
@@ -147,6 +148,137 @@ def make_empty_cache(
     return feeds
 
 
+def generate_and_validate(
+    model,
+    input_ids: torch.Tensor,
+    eos_token_id: int,
+    max_new_tokens: int = 100,
+    session: Optional[Union[InferenceSessionForTorch, onnx.ModelProto, str]] = None,
+    atol: float = 0.1,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, List[Dict]]]:
159+ """
160+ Implements a simple method ``generate`` for a torch model.
161+ The function does not expect any ``position_ids`` as input.
162+ The function also checks the outputs coming from an onnx model
163+ are close to the output the torch model produces.
164+
165+ :param model_or_path: model or loaded model
166+ :param input_ids: input tokens
167+ :param eos_token_ids: token representing the end of an answer
168+ :param max_new_tokens: stops after this number of generated tokens
169+ :param session: the onnx model
170+ :return: input tokens concatenated with new tokens,
171+ if session is not null, it also returns the maximum differences
172+ at every iterations
173+
174+ See example given with function :func:`onnx_generate
175+ <onnx_diagnostic.helpers.rt_helper.onnx_generate>`.
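+
+    A minimal usage sketch (names are illustrative: ``model`` is assumed to be
+    a loaded causal LM and ``"model.onnx"`` an exported copy of it):
+
+    .. code-block:: python
+
+        tokens, diffs = generate_and_validate(
+            model,
+            input_ids,
+            eos_token_id=2,
+            max_new_tokens=10,
+            session="model.onnx",  # hypothetical path to the exported model
+        )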
176+ """
177+ if session is not None :
178+ if not isinstance (session , InferenceSessionForTorch ):
179+ providers = ["CUDAExecutionProvider" ] if input_ids .is_cuda else []
180+ providers .append ("CPUExecutionProvider" )
181+ session = InferenceSessionForTorch (session , providers = providers )
182+
183+ # First call: prefill
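+    # The exported onnx model still expects cache inputs at this stage, so the
+    # prefill call feeds an empty cache built from the session signature.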
+    attention_mask = torch.ones(
+        input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+    )
+    if session:
+        feeds = {
+            **dict(zip(session.input_names[:2], [input_ids, attention_mask])),
+            **make_empty_cache(
+                input_ids.shape[0],
+                session.input_names[2:],
+                session.input_shapes[2:],
+                session.input_types[2:],
+            ),
+        }
+        onnx_results = session.run(None, feeds)
+
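+    # Run the torch model on the same inputs so both sets of outputs can be compared.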
+    outputs = model(input_ids, use_cache=True, attention_mask=attention_mask)
+
+    if session:
+        diff = max_diff(outputs, onnx_results)
+        assert isinstance(diff["abs"], float) and diff["abs"] <= atol, (
+            f"Unexpected issue with {type(model)}\ndiff={diff}"
+            f"\ninput_ids.shape={input_ids.shape}"
+            f"\nexpected={string_type(outputs, with_shape=True, with_min_max=True)}"
+            f"\ngot=\n"
+            f"{string_type(onnx_results, with_shape=True, with_min_max=True)}\n"
+            f"feeds={string_type(feeds, with_shape=True, with_min_max=True)}"
+        )
+        diffs = [diff]
+
+    # Next calls: decode
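+    # Greedy decoding: feed only the newly generated token together with the
+    # cache produced by the previous call, and stop as soon as EOS is emitted.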
+    for iteration in range(max_new_tokens):
+        next_token_logits = outputs.logits[:, -1, :]
+        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+        if next_token_id.item() == eos_token_id:
+            break
+        input_ids = torch.cat([input_ids, next_token_id], dim=-1)
+        attention_mask = torch.ones(
+            input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
+        )
+        if session:
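+            # flatten_object turns [token, mask, cache] into the flat list of
+            # tensors matching the onnx input names; torch_deepcopy copies them
+            # so the original torch cache is left untouched by the onnx run.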
+            feeds = dict(
+                zip(
+                    session.input_names,
+                    [
+                        t.detach()
+                        for t in torch_deepcopy(
+                            flatten_object(
+                                [next_token_id, attention_mask, outputs.past_key_values]
+                            )
+                        )
+                    ],
+                )
+            )
+            onnx_results = session.run(None, feeds)
+        outputs = model(
+            next_token_id,
+            use_cache=True,
+            past_key_values=outputs.past_key_values,
+            attention_mask=attention_mask,
+        )
+        if session:
+            diff = max_diff(outputs, onnx_results)
+            assert isinstance(diff["abs"], float) and diff["abs"] <= atol, (
+                f"Unexpected issue with {type(model)}, iteration={iteration}"
+                f"\ndiff={diff}\ninput_ids.shape={input_ids.shape}"
+                f"\nexpected={string_type(outputs, with_shape=True, with_min_max=True)}"
+                f"\ngot=\n"
+                f"{string_type(onnx_results, with_shape=True, with_min_max=True)}\n"
+                f"feeds={string_type(feeds, with_shape=True, with_min_max=True)}"
+            )
+            diffs.append(diff)
+    if session:
+        return input_ids, diffs
+    return input_ids
+
+
 def onnx_generate(
     model_or_path: Union[onnx.ModelProto, str, InferenceSessionForTorch],
     input_ids: torch.Tensor,
@@ -167,6 +299,56 @@ def onnx_generate(
         <onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch>`
         created if necessary
     :return: input tokens concatenated with new tokens
+
+    .. runpython::
+        :showcode:
+
+        import os
+        from onnx_diagnostic.helpers import string_type, string_diff
+        from onnx_diagnostic.helpers.rt_helper import onnx_generate, generate_and_validate
+        from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
+        from onnx_diagnostic.torch_export_patches import torch_export_patches
+        from onnx_diagnostic.export.api import to_onnx
+
+        mid = "arnir0/Tiny-LLM"
+        print(f"-- get model for {mid!r}")
+        data = get_untrained_model_with_inputs(mid)
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        del inputs["position_ids"]
+        del ds["position_ids"]
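+        # Neither generate_and_validate nor onnx_generate handles position_ids,
+        # so they are removed from the inputs and the dynamic shapes.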
+        input_ids = inputs["input_ids"]
+
+        print(f"-- input_ids={input_ids.shape}")
+        print(f"-- inputs: {string_type(inputs, with_shape=True)}")
+        print(f"-- dynamic_shapes: {string_type(ds)}")
+        folder = "dump_test"
+        os.makedirs(folder, exist_ok=True)
+        model_name = os.path.join(folder, "model.onnx")
+        print("-- test_onnx_generate: export model")
+        with torch_export_patches(patch_transformers=True, patch_torch=False):
+            to_onnx(
+                model,
+                (),
+                kwargs=inputs,
+                dynamic_shapes=ds,
+                filename=model_name,
+                exporter="custom",  # custom, dynamo or onnx-dynamo, modelbuilder
+            )
+
+        print("-- onnx_generate")
+        onnx_outputs = onnx_generate(model_name, input_ids[:1], 2, max_new_tokens=10)
+        print("-- onnx output", onnx_outputs)
+
+        print("-- generate")
+        torch_outputs, diffs = generate_and_validate(
+            model, input_ids[:1], 2, max_new_tokens=10, session=model_name
+        )
+        print("-- torch output", torch_outputs)
+        print("-- differences at each step:")
+        for i, d in enumerate(diffs):
+            print(f"iteration {i}: {string_diff(d)}")
170328 """
171329 if not isinstance (model_or_path , InferenceSessionForTorch ):
172330 providers = ["CUDAExecutionProvider" ] if input_ids .is_cuda else []