sdpython
diff --git a/‎CHANGELOGS.rst‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOGS.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎_doc/api/export/api.rst‎
Lines changed: 7 additions & 0 deletions b/‎_doc/api/export/api.rst‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎_doc/api/export/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎_doc/api/export/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎_doc/conf.py‎
Lines changed: 1 addition & 0 deletions b/‎_doc/conf.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎_doc/technical/plot_generate.py‎
Lines changed: 217 additions & 0 deletions b/‎_doc/technical/plot_generate.py‎
Lines changed: 217 additions & 0 deletions
diff --git a/‎_unittests/ut_export/test_api.py‎
Lines changed: 34 additions & 0 deletions b/‎_unittests/ut_export/test_api.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎_unittests/ut_helpers/test_rt_helper.py‎
Lines changed: 42 additions & 0 deletions b/‎_unittests/ut_helpers/test_rt_helper.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎_unittests/ut_helpers/test_torch_helper.py‎
Lines changed: 6 additions & 0 deletions b/‎_unittests/ut_helpers/test_torch_helper.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎_unittests/ut_xrun_doc/test_documentation_technical.py‎
Lines changed: 9 additions & 1 deletion b/‎_unittests/ut_xrun_doc/test_documentation_technical.py‎
Lines changed: 9 additions & 1 deletion
@@ -4,6 +4,7 @@ Change Logs
 0.7.17
 ++++++
 
+* :pr:`276`: implements onnx_generate which implements method generate for an onnx model
 * :pr:`275`: fixes function ``patched_vmap``
 
 0.7.16
 
@@ -0,0 +1,7 @@
+
+onnx_diagnostic.export.api
+==========================
+
+.. automodule:: onnx_diagnostic.export.api
+    :members:
+    :no-undoc-members:
@@ -5,6 +5,7 @@ onnx_diagnostic.export
     :maxdepth: 1
     :caption: modules
 
+    api
     dynamic_shapes
     shape_helper
     validate
 
@@ -277,6 +277,7 @@ def linkcode_resolve(domain, info):
 epkg_dictionary.update(
     {
         "arnir0/Tiny-LLM": "https://huggingface.co/arnir0/Tiny-LLM",
+        "microsoft/Phi-1.5": "https://huggingface.co/microsoft/phi-1_5",
         "microsoft/phi-2": "https://huggingface.co/microsoft/phi-2",
         "microsoft/Phi-3.5-mini-instruct": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
         "microsoft/Phi-3.5-vision-instruct": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
 
@@ -0,0 +1,217 @@
+"""
+.. _l-plot-generate:
+
+=================================
+From a LLM to processing a prompt
+=================================
+
+Method ``generate`` generates the model answer for a given prompt.
+Let's implement our own to understand better how it works and
+then apply it to an ONNX model.
+
+Example with Phi 1.5
+====================
+
+:epkg:`microsoft/Phi-1.5` is a small LLM. The example below asks it
+to complete a Python function given its signature and its docstring.
+"""
+
+import os
+import time
+import sys
+import pandas
+from tqdm import tqdm
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from onnx_diagnostic.ext_test_case import unit_test_going
+from onnx_diagnostic.helpers import string_type
+from onnx_diagnostic.helpers.torch_helper import to_any, get_weight_type
+from onnx_diagnostic.helpers.rt_helper import onnx_generate
+from onnx_diagnostic.torch_export_patches import torch_export_patches
+from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
+from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config, task_from_id
+from onnx_diagnostic.tasks import random_input_kwargs
+from onnx_diagnostic.export.api import to_onnx
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+data = []
+
+print("-- load the model...")
+if unit_test_going():
+    # unit_test_going() returns True if UNITTEST_GOING is 1
+    # The example switches to a faster scenario.
+    model_id = "arnir0/Tiny-LLM"
+    data_export = get_untrained_model_with_inputs(model_id)
+    model = data_export["model"]
+    export_inputs = data_export["inputs"]
+    export_shapes = data_export["dynamic_shapes"]
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+else:
+    model_id = "microsoft/phi-1_5"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    config = get_pretrained_config(model_id)
+    task = task = task_from_id(model_id)
+    kwargs, fct = random_input_kwargs(config, task)
+    res = fct(model, config, add_second_input=False, **kwargs)
+    export_inputs = res["inputs"]
+    export_shapes = res["dynamic_shapes"]
+model = model.to(device)
+print("-- done.")
+
+print("-- tokenize the prompt...")
+inputs = tokenizer(
+    '''def print_prime(n):
+   """
+   Print all primes between 1 and n
+   """''',
+    return_tensors="pt",
+    return_attention_mask=False,
+).to(device)
+print("-- done.")
+
+print("-- compute the answer...")
+begin = time.perf_counter()
+outputs = model.generate(**inputs, max_new_tokens=100)
+duration = time.perf_counter() - begin
+print(f"-- done in {duration}")
+data.append(dict(name="generate", duration=duration))
+print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
+print("-- decode the answer...")
+text = tokenizer.batch_decode(outputs)[0]
+print("-- done.")
+print(text)
+
+
+# %%
+# eos_token_id?
+# =============
+#
+# This token means the end of the answer.
+
+print("eos_token_id=", tokenizer.eos_token_id)
+
+# %%
+# Custom method generate
+# ======================
+
+
+def simple_generate_with_cache(
+    model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
+):
+    # First call: prefill
+    outputs = model(input_ids, use_cache=True)
+
+    # Next calls: decode
+    for _ in tqdm(list(range(max_new_tokens))):
+        next_token_logits = outputs.logits[:, -1, :]
+        past_key_values = outputs.past_key_values
+
+        # The most probable next token is chosen.
+        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+        # But we could select it using a multinomial law
+        # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
+        # <<< top_probs, top_indices = torch.topk(probs, top_k)
+        # <<< next_token_id = top_indices[torch.multinomial(top_probs, 1)]
+
+        if next_token_id.item() == eos_token_id:
+            break
+        input_ids = torch.cat([input_ids, next_token_id], dim=-1)
+
+        # Feed only the new token, but with the cache
+        outputs = model(next_token_id, use_cache=True, past_key_values=past_key_values)
+
+    return input_ids
+
+
+print("-- compute the answer with custom generate...")
+begin = time.perf_counter()
+outputs = simple_generate_with_cache(
+    model, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
+)
+duration = time.perf_counter() - begin
+print(f"-- done in {duration}")
+data.append(dict(name="custom", duration=duration))
+
+print("-- done.")
+print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
+print("-- decode the answer...")
+text = tokenizer.batch_decode(outputs)[0]
+print("-- done.")
+print(text)
+
+# %%
+# Method generate for onnx models
+# ===============================
+#
+# We first need to export the model into ONNX.
+#
+# ONNX Conversion
+# +++++++++++++++
+
+if "position_ids" in export_inputs:
+    del export_inputs["position_ids"]
+    del export_shapes["position_ids"]
+dtype = get_weight_type(model)
+print("-- model dtype:", dtype)
+export_inputs["past_key_values"] = to_any(export_inputs["past_key_values"], dtype)
+exporter = "custom" if "custom" in sys.argv else "onnx-dynamo"
+model_name = f"model_{model_id.replace('/', '-')}.{exporter}.onnx"
+if not os.path.exists(model_name):
+    # This step is slow so let's skip it if it was already done.
+    print("-- conversion to ONNX.")
+    begin = time.perf_counter()
+    with torch_export_patches(patch_transformers=True):
+        to_onnx(
+            model,
+            (),
+            kwargs=to_any(export_inputs, device),
+            dynamic_shapes=export_shapes,
+            filename=model_name,
+            verbose=1,
+            exporter=exporter,
+        )
+    duration = time.perf_counter() - begin
+    print(f"-- done in {duration}")
+
+# %%
+# onnx_generate
+# +++++++++++++
+#
+# Then we can call method generate for two tokens.
+# This function is part of :mod:`onnx_diagnostic` but follows the implementation
+# seen earlier for a torch model.
+# Let's first ask the function to return the session
+# to avoid creating it again on the second call.
+
+_res, session = onnx_generate(
+    model_name, inputs.input_ids, 2, max_new_tokens=2, return_session=True
+)
+
+# And now the full answer.
+print("-- compute the answer with custom generate...")
+begin = time.perf_counter()
+outputs = onnx_generate(
+    session, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
+)
+duration = time.perf_counter() - begin
+print(f"-- done in {duration}")
+data.append(dict(name="onnx", duration=duration))
+
+print("-- done.")
+print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
+print("-- decode the answer...")
+text = tokenizer.batch_decode(outputs)[0]
+print("-- done.")
+print(text)
+
+
+# %%
+# Plots
+# =====
+df = pandas.DataFrame(data).set_index("name")
+print(df)
+
+# %%
+ax = df.plot(kind="bar", title="Time (s) comparison to generate a prompt.", rot=45)
+ax.figure.tight_layout()
+ax.figure.savefig("plot_generate.png")
@@ -0,0 +1,34 @@
+import unittest
+import torch
+from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout
+from onnx_diagnostic.export.api import to_onnx
+
+
+class TestValidate(ExtTestCase):
+    # Smoke test for to_onnx: the same small model is exported twice,
+    # once per exporter name, and the test passes if neither call raises.
+    @hide_stdout()
+    def test_to_onnx(self):
+        class Model(torch.nn.Module):
+            def forward(self, x, y):
+                return x + y
+
+        x = torch.randn((5, 6))
+        # First dimension is 1: the addition broadcasts y over the rows of x.
+        y = torch.randn((1, 6))
+        # One specification per input: both dimensions of x are dynamic,
+        # only dimension 1 of y is, and it shares the name "b" with x.
+        ds = ({0: "a", 1: "b"}, {1: "b"})
+        to_onnx(
+            Model(),
+            (x, y),
+            dynamic_shapes=ds,
+            exporter="custom",
+            filename=self.get_dump_file("custom.onnx"),
+        )
+        to_onnx(
+            Model(),
+            (x, y),
+            dynamic_shapes=ds,
+            exporter="onnx-dynamo",
+            filename=self.get_dump_file("onnx-dynamo.onnx"),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
@@ -0,0 +1,42 @@
+import os
+import unittest
+import torch
+from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout
+from onnx_diagnostic.helpers.rt_helper import onnx_generate
+from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
+from onnx_diagnostic.torch_export_patches import torch_export_patches
+
+
+class TestRtSession(ExtTestCase):
+    # Exports a small model to ONNX and checks the type and shape
+    # of the tokens returned by onnx_generate.
+    @hide_stdout()
+    def test_onnx_generate(self):
+        # Imported inside the test, not at module level.
+        # NOTE(review): presumably because the package is optional, confirm.
+        from experimental_experiment.torch_interpreter import to_onnx
+
+        mid = "arnir0/Tiny-LLM"
+        print("-- test_onnx_generate: get model")
+        data = get_untrained_model_with_inputs(mid)
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        # position_ids is dropped from the inputs and from the dynamic shapes,
+        # the model is exported without it.
+        del inputs["position_ids"]
+        del ds["position_ids"]
+        input_ids = inputs["input_ids"]
+        folder = self.get_dump_folder("test_onnx_generate")
+        model_name = os.path.join(folder, "model.onnx")
+        print("-- test_onnx_generate: export model")
+        with torch_export_patches(patch_transformers=True, patch_torch=False):
+            to_onnx(
+                model,
+                (),
+                kwargs=inputs,
+                dynamic_shapes=ds,
+                filename=model_name,
+            )
+
+        print("-- test_onnx_generate: generate")
+        # Only the first row of input_ids is used as the prompt.
+        # NOTE(review): the third positional argument (2) looks like
+        # eos_token_id, confirm against the signature of onnx_generate.
+        res = onnx_generate(model_name, input_ids[:1], 2, max_new_tokens=10)
+        self.assertEqual(res.dtype, torch.int64)
+        # NOTE(review): 13 is presumably a prompt of 3 tokens followed by
+        # the 10 new tokens, confirm the length of the prompt.
+        self.assertEqual(res.shape, (1, 13))
+        print("-- test_onnx_generate: done")
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
@@ -8,6 +8,7 @@
 from onnx_diagnostic.helpers import max_diff, string_type
 from onnx_diagnostic.helpers.torch_helper import (
     dummy_llm,
+    get_weight_type,
     to_numpy,
     is_torchdynamo_exporting,
     model_statistics,
@@ -415,6 +416,11 @@ def test_to_tensor(self):
                 c = to_tensor(proto)
                 self.assertEqualArray(a, c)
 
+    def test_get_weight_type(self):
+        # The model returned by dummy_llm("LLM") is expected to hold float32 weights.
+        model, _inputs = dummy_llm("LLM")
+        dt = get_weight_type(model)
+        self.assertEqual(torch.float32, dt)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
@@ -6,7 +6,12 @@
 import time
 import torch
 from onnx_diagnostic import __file__ as onnx_diagnostic_file
-from onnx_diagnostic.ext_test_case import ExtTestCase, is_windows, ignore_errors
+from onnx_diagnostic.ext_test_case import (
+    ExtTestCase,
+    is_windows,
+    ignore_errors,
+    has_transformers,
+)
 
 
 VERBOSE = 0
@@ -80,6 +85,9 @@ def add_test_methods(cls):
             if not reason and torch.__version__.startswith("2.9.0"):
                 reason = "examples are failing for on CI for 2.9.0"
 
+            if not reason and not has_transformers("4.55.0") and name in {"plot_generate.py"}:
+                reason = "transformers 4.55 is required"
+
             if reason:
 
                 @unittest.skip(reason)
Original file line number	Diff line number	Diff line change
`@@ -277,6 +277,7 @@ def linkcode_resolve(domain, info):`
`277`	`277`	`epkg_dictionary.update(`
`278`	`278`	`{`
`279`	`279`	`"arnir0/Tiny-LLM": "https://huggingface.co/arnir0/Tiny-LLM",`
	`280`	`+ "microsoft/Phi-1.5": "https://huggingface.co/microsoft/phi-1_5",`
`280`	`281`	`"microsoft/phi-2": "https://huggingface.co/microsoft/phi-2",`
`281`	`282`	`"microsoft/Phi-3.5-mini-instruct": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",`
`282`	`283`	`"microsoft/Phi-3.5-vision-instruct": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",`