
Commit 143deef

Adds prompt to test speedup with onnx_generate (#287)
* Adds prompt to test speedup with onnx_generate
* prompt
* fix
* try
* doc
1 parent 6ca5f73 commit 143deef

8 files changed (+145, -65 lines)

CHANGELOGS.rst

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ Change Logs
 0.8.1
 +++++
 
+* :pr:`287`: adds input ``'inputs_prompt'`` to test a LLM, meant to be used during validation
 * :pr:`288`: add .contiguous in torch.cond branch (attention patch for sdpa implementation)
 * :pr:`286`: adds variable to track random nodes in models

_doc/technical/plot_generate.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def simple_generate_with_cache(
 # seen earlier for a torch model.
 # Let's ask first the function to return the session to avoid creating on the second call.
 
-_res, session = onnx_generate(
+_res, session, _feeds = onnx_generate(
     model_name, inputs.input_ids, 2, max_new_tokens=2, return_session=True
 )
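Since onnx_generate now also returns the feeds prepared for the next iteration, the call above unpacks three values. A minimal sketch (not part of this diff, reusing the variable names from the documentation example) of how the returned session could then be passed back as the first argument so the ONNX model is only loaded once:

    _res, session, _feeds = onnx_generate(
        model_name, inputs.input_ids, 2, max_new_tokens=2, return_session=True
    )
    # second call: reuse the InferenceSessionForTorch created above instead of the model path
    res = onnx_generate(session, inputs.input_ids, 2, max_new_tokens=2)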

_unittests/ut_helpers/test_rt_helper.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def test_onnx_generate(self):
         )
 
         print("-- test_onnx_generate: generate")
-        res, session = onnx_generate(
+        res, session, _feeds = onnx_generate(
             model_name, input_ids[:1], 2, max_new_tokens=10, return_session=True
         )
         n_inputs = input_ids.shape[1]

_unittests/ut_tasks/test_tasks_text_generation.py

Lines changed: 22 additions & 0 deletions
@@ -5,12 +5,14 @@
     hide_stdout,
     requires_transformers,
     requires_torch,
+    ignore_warnings,
 )
 from onnx_diagnostic.helpers.torch_helper import torch_deepcopy
 from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
 from onnx_diagnostic.torch_export_patches import torch_export_patches
 from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
 from onnx_diagnostic.export.shape_helper import make_fake_with_dynamic_dimensions
+from onnx_diagnostic.helpers.rt_helper import onnx_generate, generate_and_validate
 
 
 class TestTasksTextGeneration(ExtTestCase):
@@ -75,6 +77,26 @@ def test_text_generation_tiny_llm(self):
         self.assertEqualAny(expected.past_key_values, got.past_key_values)
         self.assertEqualArray(expected.logits, got.logits)
 
+    @hide_stdout()
+    @requires_transformers("4.53")
+    @requires_torch("2.8.99")  # check_guards not supported
+    @ignore_warnings(FutureWarning)
+    def test_text_generation_tiny_llm_prompt_validation(self):
+        from experimental_experiment.torch_interpreter import to_onnx
+
+        mid = "arnir0/Tiny-LLM"
+        data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
+        prompt = data["inputs_prompt"]["input_ids"]
+        self.assertEqual(data["task"], "text-generation")
+        model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
+        with torch_export_patches(patch_transformers=True, verbose=1, patch_torch=False):
+            onx = to_onnx(model, inputs, dynamic_shapes=ds)
+
+        self.dump_onnx("test_text_generation_tiny_llm_prompt_validation.onnx", onx)
+        onnx_sequence = onnx_generate(onx, prompt, max_new_tokens=3)
+        torch_sequence = generate_and_validate(model, prompt, max_new_tokens=3)
+        self.assertEqualArray(torch_sequence, onnx_sequence)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
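The new test only checks that the ONNX and torch generation loops produce the same tokens. Since the commit title mentions measuring speedup, here is a minimal sketch (not part of this commit) of how the same prompt could be used to time both paths, reusing onx, model and prompt from the test above:

    import time

    t0 = time.perf_counter()
    onnx_sequence = onnx_generate(onx, prompt, max_new_tokens=3)
    t_onnx = time.perf_counter() - t0

    t0 = time.perf_counter()
    torch_sequence = generate_and_validate(model, prompt, max_new_tokens=3)
    t_torch = time.perf_counter() - t0

    # rough comparison only; a real benchmark would repeat and average the runs
    print(f"onnx={t_onnx:.3f}s torch={t_torch:.3f}s speedup={t_torch / t_onnx:.2f}x")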

onnx_diagnostic/helpers/rt_helper.py

Lines changed: 29 additions & 8 deletions
@@ -149,7 +149,7 @@ def make_empty_cache(
 def generate_and_validate(
     model,
     input_ids: torch.Tensor,
-    eos_token_id: int,
+    eos_token_id: int = 2,
     max_new_tokens: int = 100,
     session: Optional[Union[InferenceSessionForTorch, onnx.ModelProto, str]] = None,
     atol: float = 0.1,
@@ -258,10 +258,10 @@ def generate_and_validate(
 def onnx_generate(
     model_or_path: Union[onnx.ModelProto, str, InferenceSessionForTorch],
     input_ids: torch.Tensor,
-    eos_token_id: int,
+    eos_token_id: int = 2,
     max_new_tokens=100,
     return_session: bool = False,
-) -> Union[torch.Tensor, Tuple[torch.Tensor, InferenceSessionForTorch]]:
+) -> Union[torch.Tensor, Tuple[torch.Tensor, InferenceSessionForTorch, Dict[str, Any]]]:
     """
     Implements a simple method ``generate`` for an ONNX model.
     The function does not expect any ``position_ids`` as input.
@@ -273,7 +273,7 @@ def onnx_generate(
     :param return_session: returns the instance of class
         :class:`InferenceSessionForTorch
         <onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch>`
-        created if necessary
+        created if necessary, the function returns the feeds for the next iteration
     :return: input tokens concatenated with new tokens
 
     .. runpython::
@@ -349,12 +349,19 @@ def onnx_generate(
     input_shapes = session.input_shapes
     input_names = session.input_names
     input_types = session.input_types
+    has_position_ids = "position_ids" in session.input_names
 
     assert (
         len(input_names) > 2
         and input_names[:2] == ["input_ids", "attention_mask"]
-        and input_names[2].startswith("past_key_values")
-    ), f"Only text generation is supported but input_names == {input_names}"
+        and input_names[3 if has_position_ids else 2].startswith("past_key_values")
+    ), (
+        f"Only text generation is supported but input_names == {input_names}, "
+        f"has_position_ids={has_position_ids}"
+    )
+    assert (
+        not has_position_ids or input_names[2] == "position_ids"
+    ), f"position_ids must the third input but input_names={input_names}"
 
     # First call: prefill
     feeds = dict(
@@ -366,6 +373,10 @@ def onnx_generate(
             input_ids.shape[0], input_names[2:], input_shapes[2:], input_types[2:]
         ),
     )
+    if has_position_ids:
+        feeds["position_ids"] = torch.unsqueeze(
+            torch.arange(input_ids.shape[1], dtype=torch.int64, device=input_ids.device), 0
+        )
 
     outputs = session.run(None, feeds)
 
@@ -389,11 +400,21 @@ def onnx_generate(
                 input_ids.shape, dtype=input_ids.dtype, device=input_ids.device
             ),
         )
-        feeds.update(dict(zip(input_names[2:], outputs[1:])))
+        if has_position_ids:
+            feeds["position_ids"] = torch.unsqueeze(
+                torch.arange(
+                    input_ids.shape[1],
+                    input_ids.shape[1] + 1,
+                    dtype=torch.int64,
+                    device=input_ids.device,
+                ),
+                0,
+            )
+        feeds.update(dict(zip(input_names[3 if has_position_ids else 2 :], outputs[1:])))
        outputs = session.run(None, feeds)
 
     if return_session:
-        return input_ids, session
+        return input_ids, session, feeds
     return input_ids
 
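With these changes, onnx_generate defaults eos_token_id to 2, detects an optional position_ids input, and, when return_session is True, returns the feeds built for the last decoding step alongside the session. A minimal usage sketch (the path "model.onnx" is hypothetical; any exported text-generation model satisfying the input checks above would do):

    import torch
    from onnx_diagnostic.helpers.rt_helper import onnx_generate

    prompt = torch.randint(1000, 30000, (1, 11))
    tokens, session, feeds = onnx_generate(
        "model.onnx", prompt, max_new_tokens=5, return_session=True
    )
    # tokens is the prompt concatenated with the generated tokens;
    # feeds holds the inputs of the last decoding step (including position_ids
    # when the model declares that input) and the session can be reused directly.
    print(tokens.shape, sorted(feeds))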

onnx_diagnostic/tasks/text_generation.py

Lines changed: 84 additions & 54 deletions
@@ -56,6 +56,74 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
     return kwargs
 
 
+def _get_input_falcon_mamba(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    dummy_max_token_id: int,
+    num_hidden_layers: int,
+    batch_size: int = 2,
+    sequence_length: int = 30,
+    sequence_length2: int = 3,
+    dynamic_rope: bool = False,
+    num_key_value_heads: Optional[int] = None,
+    head_dim: Optional[int] = None,
+    cls_cache: Optional[Union[type, str]] = None,
+    **kwargs,  # unused
+):
+    try:
+        from transformers.models.mamba.modeling_mamba import MambaCache
+    except ImportError:
+        from transformers.cache_utils import MambaCache
+
+    assert cls_cache in (
+        "MambaCache",
+        MambaCache,
+    ), f"Unexpected value for cls_cache={cls_cache} and config={config}"
+
+    batch = "batch"
+    seq_length_multiple = 8
+    sequence_length = (
+        (sequence_length + seq_length_multiple) // seq_length_multiple * seq_length_multiple
+    )
+    # sequence_inc = seq_length_multiple
+    sequence_length2 = seq_length_multiple
+
+    shapes = {
+        "input_ids": {0: batch, 1: "sequence_length"},
+        "attention_mask": {
+            0: batch,
+            1: "cache+seq",  # cache_length + seq_length
+        },
+        "cache_position": {
+            0: batch,
+            1: "cache+seq",  # cache_length + seq_length
+        },
+        "cache_params": [{0: batch} for _ in range(num_hidden_layers * 2)],
+    }
+    inputs = dict(
+        input_ids=torch.randint(
+            0, dummy_max_token_id, (batch_size, sequence_length + sequence_length2)
+        ).to(torch.int64),
+        attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to(
+            torch.int64
+        ),
+        cache_position=torch.arange(0, kwargs["conv_kernel"]).to(torch.int64),
+        # .expand((batch_size, -1))
+        cache_params=make_mamba_cache(
+            [
+                (
+                    torch.randn(
+                        batch_size, kwargs["intermediate_size"], kwargs["conv_kernel"]
+                    ),
+                    torch.randn(batch_size, kwargs["intermediate_size"], kwargs["state_size"]),
+                )
+                for i in range(num_hidden_layers)
+            ]
+        ),
+    )
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+
+
 def get_inputs(
     model: torch.nn.Module,
     config: Optional[Any],
@@ -68,7 +136,7 @@ def get_inputs(
     num_key_value_heads: Optional[int] = None,
     head_dim: Optional[int] = None,
     cls_cache: Optional[Union[type, str]] = None,
-    add_second_input: int = 1,
+    add_second_input: Optional[int] = None,
     **kwargs,  # unused
 ):
     """
@@ -84,67 +152,28 @@ def get_inputs(
     :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
     :param cls_cache: cache class, by default it is
         :class:`transformers.cache_utils.DynamicCache`
+    :param add_second_input: adds other kinds of inputs
     :return: dictionary
     """
     batch = "batch"
     seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
     cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
 
     if config is not None and config.__class__.__name__ == "FalconMambaConfig":
-        try:
-            from transformers.models.mamba.modeling_mamba import MambaCache
-        except ImportError:
-            from transformers.cache_utils import MambaCache
-
-        assert cls_cache in (
-            "MambaCache",
-            MambaCache,
-        ), f"Unexpected value for cls_cache={cls_cache} and config={config}"
-        seq_length_multiple = 8
-        sequence_length = (
-            (sequence_length + seq_length_multiple)
-            // seq_length_multiple
-            * seq_length_multiple
-        )
-        # sequence_inc = seq_length_multiple
-        sequence_length2 = seq_length_multiple
-
-        shapes = {
-            "input_ids": {0: batch, 1: "sequence_length"},
-            "attention_mask": {
-                0: batch,
-                1: "cache+seq",  # cache_length + seq_length
-            },
-            "cache_position": {
-                0: batch,
-                1: "cache+seq",  # cache_length + seq_length
-            },
-            "cache_params": [{0: batch} for _ in range(num_hidden_layers * 2)],
-        }
-        inputs = dict(
-            input_ids=torch.randint(
-                0, dummy_max_token_id, (batch_size, sequence_length + sequence_length2)
-            ).to(torch.int64),
-            attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to(
-                torch.int64
-            ),
-            cache_position=torch.arange(0, kwargs["conv_kernel"]).to(torch.int64),
-            # .expand((batch_size, -1))
-            cache_params=make_mamba_cache(
-                [
-                    (
-                        torch.randn(
-                            batch_size, kwargs["intermediate_size"], kwargs["conv_kernel"]
-                        ),
-                        torch.randn(
-                            batch_size, kwargs["intermediate_size"], kwargs["state_size"]
-                        ),
-                    )
-                    for i in range(num_hidden_layers)
-                ]
-            ),
+        res = _get_input_falcon_mamba(
+            model=model,
+            config=config,
+            dummy_max_token_id=dummy_max_token_id,
+            num_hidden_layers=num_hidden_layers,
+            batch_size=batch_size,
+            sequence_length=sequence_length,
+            sequence_length2=sequence_length2,
+            dynamic_rope=dynamic_rope,
+            num_key_value_heads=num_key_value_heads,
+            head_dim=head_dim,
+            cls_cache=cls_cache,
+            **kwargs,  # unused
         )
-        res = dict(inputs=inputs, dynamic_shapes=shapes)
     else:
         if head_dim is None:
             assert config, "head_dim is None, the value cannot be set without a configuration"
@@ -244,6 +273,7 @@ def get_inputs(
         )
         res = dict(inputs=inputs, dynamic_shapes=shapes)
     if add_second_input:
+        res["inputs_prompt"] = dict(input_ids=torch.randint(1000, 30000, (1, 11)))
         res["inputs2"] = get_inputs(
             model=model,
             config=config,
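When add_second_input is set, the dictionary returned by get_inputs now also carries an inputs_prompt entry: a single batch of 11 random token ids intended for generation-style validation rather than a plain forward call. A minimal sketch of how it is meant to be consumed, reusing the Tiny-LLM identifier from the unit test above (variable names are illustrative):

    from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
    from onnx_diagnostic.helpers.rt_helper import generate_and_validate

    data = get_untrained_model_with_inputs("arnir0/Tiny-LLM", add_second_input=True)
    prompt = data["inputs_prompt"]["input_ids"]  # shape (1, 11), token ids in [1000, 30000)
    sequence = generate_and_validate(data["model"], prompt, max_new_tokens=3)
    print(sequence)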

onnx_diagnostic/torch_export_patches/patches/patch_torch.py

Lines changed: 4 additions & 1 deletion
@@ -195,9 +195,12 @@ def _check_frozen(
         if self.frozen:
             self.counter["ignored_backward_guard"] += 1
             # PATCHED: raised an exception instead of logging.
+            import transformers
+
             raise AssertionError(
                 f"[patched_ShapeEnv] Ignored guard {expr} == {concrete_val}, "
-                f"this could result in accuracy problems"
+                f"this could result in accuracy problems, transformers.__version__="
+                f"{transformers.__version__!r}"
             )
 
     def _set_replacement(

onnx_diagnostic/torch_models/validate.py

Lines changed: 3 additions & 0 deletions
@@ -1463,6 +1463,9 @@ def _mk(key, flavour=flavour):
     if verbose:
         print(f"[validate_onnx_model] -- keys={keys}")
     for k_input, k_expected, suffix in keys:
+        if k_input == "inputs_prompt":
+            # this must used onnx_generate
+            continue
         # make_feeds
         assert k_input in data, f"Unable to find {k_input!r} in {sorted(data)}"
         assert k_expected in data, f"Unable to find {k_expected!r} in {sorted(data)}"
