:epkg:`microsoft/Phi-1.5` is a small LLM. The example given
"""

import os
import time
import sys
import pandas
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.helpers.torch_helper import to_any, get_weight_type
from onnx_diagnostic.helpers.rt_helper import onnx_generate
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config, task_from_id
from onnx_diagnostic.tasks import random_input_kwargs
from onnx_diagnostic.export.api import to_onnx


device = "cuda" if torch.cuda.is_available() else "cpu"
data = []

print("-- load the model...")
if unit_test_going():
    # unit_test_going() returns True if UNITTEST_GOING is 1.
    # The example switches to a faster scenario.
    model_id = "arnir0/Tiny-LLM"
    data_export = get_untrained_model_with_inputs(model_id)
    model = data_export["model"]
    export_inputs = data_export["inputs"]
    export_shapes = data_export["dynamic_shapes"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    model_id = "microsoft/phi-1_5"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = get_pretrained_config(model_id)
    task = task_from_id(model_id)
    kwargs, fct = random_input_kwargs(config, task)
    res = fct(model, config, add_second_input=False, **kwargs)
    export_inputs = res["inputs"]
    export_shapes = res["dynamic_shapes"]
model = model.to(device)
print("-- done.")

print("-- compute the answer...")
begin = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=100)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="generate", duration=duration))
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
def simple_generate_with_cache(
    model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
):
    # First call: prefill
    outputs = model(input_ids, use_cache=True)

    # Next calls: decode
    for _ in tqdm(list(range(max_new_tokens))):
        next_token_logits = outputs.logits[:, -1, :]
        past_key_values = outputs.past_key_values

        # The most probable next token is chosen.
        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
        # But we could select it using a multinomial law
        # (a complete sketch follows this function).
        # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
        # <<< top_probs, top_indices = torch.topk(probs, top_k)
        # <<< next_token_id = top_indices[torch.multinomial(top_probs, 1)]

        if next_token_id.item() == eos_token_id:
            break
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)

        # Feed only the new token, but with the cache.
        outputs = model(next_token_id, use_cache=True, past_key_values=past_key_values)

    return input_ids
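
# %%
# Instead of the greedy ``argmax`` above, the next token could be drawn with a
# multinomial law as hinted in the comments of ``simple_generate_with_cache``.
# A minimal, self-contained sketch, not used below; the function name and the
# ``temperature`` and ``top_k`` defaults are only illustrative.


def sample_next_token(next_token_logits, temperature=0.8, top_k=50):
    # Scale the logits by the temperature and keep the top_k candidates.
    probs = torch.softmax(next_token_logits / temperature, dim=-1)
    top_probs, top_indices = torch.topk(probs, top_k)
    # Draw one candidate per row according to its probability,
    # then map it back to the original token id.
    choice = torch.multinomial(top_probs, num_samples=1)
    return torch.gather(top_indices, -1, choice)
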
print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = simple_generate_with_cache(
    model, inputs.input_ids, tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="custom", duration=duration))

print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)

# %%
# Method generate for onnx models
# ===============================
#
# We first need to export the model into ONNX.
#
# ONNX Conversion
# +++++++++++++++

if "position_ids" in export_inputs:
    del export_inputs["position_ids"]
    del export_shapes["position_ids"]
dtype = get_weight_type(model)
print("-- model dtype:", dtype)
export_inputs["past_key_values"] = to_any(export_inputs["past_key_values"], dtype)
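
# An optional look at what will actually be exported, using ``string_type`` as
# above; this inspection line is only illustrative and can be removed.
print("-- export inputs:", string_type(export_inputs, with_shape=True))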
exporter = "custom" if "custom" in sys.argv else "onnx-dynamo"
model_name = f"model_{model_id.replace('/', '-')}.{exporter}.onnx"
if not os.path.exists(model_name):
    # This step is slow so let's skip it if it was already done.
    print("-- conversion to ONNX.")
    begin = time.perf_counter()
    with torch_export_patches(patch_transformers=True):
        to_onnx(
            model,
            (),
            kwargs=to_any(export_inputs, device),
            dynamic_shapes=export_shapes,
            filename=model_name,
            verbose=1,
            exporter=exporter,
        )
    duration = time.perf_counter() - begin
    print(f"-- done in {duration}")

# %%
# onnx_generate
# +++++++++++++
#
# Then we can call the generate method for two tokens.
# This function is part of :epkg:`onnx_diagnostic` but follows the implementation
# seen earlier for a torch model.
# Let's first ask the function to return the session so that it does not have to
# be created again on the second call.

_res, session = onnx_generate(
    model_name, inputs.input_ids, 2, max_new_tokens=2, return_session=True
)
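
# A quick look at the warm-up output; this inspection line is only illustrative
# and can be removed.
print("-- warm-up output:", string_type(_res, with_shape=True))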

# And now the full answer.
print("-- compute the answer with onnx_generate...")
begin = time.perf_counter()
outputs = onnx_generate(
    session, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="onnx", duration=duration))

print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)


# %%
# Plots
# =====
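#
# A minimal sketch of how the collected durations could be compared with pandas,
# assuming only the ``data`` list filled above; the original example may plot
# this differently.

df = pandas.DataFrame(data).set_index("name")
print(df)
df["duration"].plot.barh(title="duration of the generation loops")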