From 126e585a01d063eac16d6b49f3cc79d7082d255d Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 3 Oct 2025 15:19:41 +0200
Subject: [PATCH 1/4] Add set of inputs for empty cache

---
 _unittests/ut_tasks/test_tasks.py             |  20 ++
 _unittests/ut_tasks/try_tasks.py              |  44 +++++
 onnx_diagnostic/helpers/torch_helper.py       |   7 +-
 onnx_diagnostic/tasks/text_generation.py      |  15 ++
 .../patches/patch_transformers.py             | 186 +++++++++++++++++-
 .../torch_models/hghub/model_inputs.py        |   2 +-
 onnx_diagnostic/torch_models/validate.py      |  75 ++++---
 7 files changed, 316 insertions(+), 33 deletions(-)

diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py
index 57c83b55..1bb71bff 100644
--- a/_unittests/ut_tasks/test_tasks.py
+++ b/_unittests/ut_tasks/test_tasks.py
@@ -48,6 +48,26 @@ def test_text_generation(self):
                 model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
             )
 
+    def test_text_generation_empty_cache(self):
+        mid = "arnir0/Tiny-LLM"
+        data = get_untrained_model_with_inputs(mid, add_second_input=True)
+        model, inputs = data["model"], data["inputs"]
+        self.assertIn("inputs_empty_cache", data)
+        empty_inputs = torch_deepcopy(data["inputs_empty_cache"])
+        expected = model(**empty_inputs)
+        self.assertEqual(
+            {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)
+        )
+        with torch_export_patches(patch_transformers=True, verbose=1):
+            ep = torch.export.export(
+                model,
+                (),
+                kwargs=inputs,
+                dynamic_shapes=use_dyn_not_str(data["dynamic_shapes"]),
+            )
+            got = ep.module()(**inputs)
+            self.assertEqualArrayAny(expected, got)
+
     @hide_stdout()
     def test_automatic_speech_recognition_float32(self):
         mid = "openai/whisper-tiny"
diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py
index f291633a..1e498d27 100644
--- a/_unittests/ut_tasks/try_tasks.py
+++ b/_unittests/ut_tasks/try_tasks.py
@@ -4,6 +4,7 @@
 from onnx_diagnostic.helpers import string_type
 from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache
 from onnx_diagnostic.helpers.torch_helper import steal_forward
+from onnx_diagnostic.torch_export_patches import torch_export_patches
 from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
 
 
@@ -130,6 +131,49 @@ def test_text2text_generation_static(self):
             )
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
 
+    @never_test()
+    def test_text_generation_tiny_llm(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k tiny_llm
+        """
+        Calls ``generate`` on ``arnir0/Tiny-LLM`` with the transformers patches enabled.
+        Sample of inputs (NOTE(review): may come from another model, confirm):
+
+        dict(cache_position:T7s21,
+             past_key_values:DynamicCache(key_cache=#0[], value_cache=#0[]),
+             input_ids:T7s1x21,
+             position_ids:T7s1x21
+             attention_mask:T1s1x21)
+        dict(cache_position:T7s1,
+             past_key_values:DynamicCache(key_cache=#32[T1s1x8x21x128,...],
+                                          value_cache=#32[T1s1x8x21x128,...]),
+             input_ids:T7s1x21,
+             position_ids:T7s1x1
+             attention_mask:T1s1x1)
+        """
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        tokenizer = AutoTokenizer.from_pretrained("arnir0/Tiny-LLM")
+        # model and tokenizer must share a checkpoint so token ids match the vocabulary
+        model = AutoModelForCausalLM.from_pretrained("arnir0/Tiny-LLM")
+
+        text = "def greet(user): print(f'hello <extra_id_0>!')"
+        input_ids = tokenizer(text, return_tensors="pt").input_ids.reshape((1, -1))
+        mask = torch.ones((1, input_ids.shape[1]), dtype=torch.int64)
+        position_ids = torch.arange(input_ids.shape[1], dtype=torch.int64).reshape((1, -1))
+
+        # simply generate a single sequence
+        print()
+        with torch_export_patches(
+            patch_transformers=True, patch_torch=False, patch_sympy=False
+        ), steal_forward(model):
+            generated_ids = model.generate(
+                input_ids=input_ids,
+                max_length=100,
+                attention_mask=mask,
+                position_ids=position_ids,
+            )
+        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
+
     @never_test()
     def test_text_generation_phi4_mini(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_mini
diff --git a/onnx_diagnostic/helpers/torch_helper.py b/onnx_diagnostic/helpers/torch_helper.py
index 734aba76..15ab4ae7 100644
--- a/onnx_diagnostic/helpers/torch_helper.py
+++ b/onnx_diagnostic/helpers/torch_helper.py
@@ -765,7 +765,12 @@ def to_any(value: Any, to_value: Union[torch.dtype, torch.device, str]) -> Any:
 
 
 def torch_deepcopy(value: Any) -> Any:
-    """Makes a deepcopy."""
+    """
+    Makes a deep copy.
+
+    :param value: any value
+    :return: a deep copy
+    """
     if value is None:
         return None
     if isinstance(value, (int, float, str)):
diff --git a/onnx_diagnostic/tasks/text_generation.py b/onnx_diagnostic/tasks/text_generation.py
index 6e6e29ba..964e0462 100644
--- a/onnx_diagnostic/tasks/text_generation.py
+++ b/onnx_diagnostic/tasks/text_generation.py
@@ -269,6 +269,21 @@ def get_inputs(
             add_second_input=0,
             **kwargs,
         )["inputs"]
+        res["inputs_empty_cache"] = get_inputs(
+            model=model,
+            config=config,
+            dummy_max_token_id=dummy_max_token_id,
+            num_hidden_layers=num_hidden_layers,
+            batch_size=batch_size,
+            sequence_length=0,
+            sequence_length2=sequence_length2,
+            dynamic_rope=dynamic_rope,
+            num_key_value_heads=num_key_value_heads,
+            head_dim=head_dim,
+            cls_cache=cls_cache,
+            add_second_input=0,
+            **kwargs,
+        )["inputs"]
     return res
 
 
diff --git a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
index e95a0a47..467abc62 100644
--- a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
+++ b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
@@ -1,13 +1,22 @@
 import inspect
 import math
+import os
 from dataclasses import dataclass
 from functools import wraps
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple, Union
 import packaging.version as pv
 import torch
 import transformers
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.cache_utils import StaticCache, Cache
+from transformers.generation.utils import (
+    GenerateDecoderOnlyOutput,
+    GenerateEncoderDecoderOutput,
+    GenerateNonBeamOutput,
+    GenerationConfig,
+    StoppingCriteriaList,
+    LogitsProcessorList,
+)
 
 try:
     from transformers.cache_utils import parse_processor_args  # noqa: F401
@@ -456,6 +465,11 @@ class patched_GenerationMixin:
         "_cache_dependant_input_preparation",
         "_cache_dependant_input_preparation_exporting",
         "prepare_inputs_for_generation",
+        (
+            "_sample"
+            if pv.Version(transformers.__version__) == pv.Version("4.57.0.dev0")
+            else None
+        ),
     ]
     _PATCHED_CLASS_ = transformers.generation.utils.GenerationMixin
 
@@ -588,7 +602,7 @@ def prepare_inputs_for_generation(
         model_inputs = {}
         # - some models don't have `Cache` support
         # (which implies they don't expect `cache_position` in `forward`)
-        if self._supports_cache_class:
+        if getattr(self, "_supports_cache_class", False):
             model_inputs["cache_position"] = cache_position
         # - `cache_position` was not a mandatory input in
         # `prepare_inputs_for_generation` for those models, and this
@@ -728,6 +742,174 @@ def prepare_inputs_for_generation(
         model_inputs.pop("labels", None)
         return model_inputs
 
+    def _sample(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,  # noqa: F821
+        **model_kwargs,
+    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+        # init values
+        pad_token_id = generation_config._pad_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        has_eos_stopping_criteria = any(
+            hasattr(criteria, "eos_token_id") for criteria in stopping_criteria
+        )
+        do_sample = generation_config.do_sample
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = (
+            () if (return_dict_in_generate and output_hidden_states) else None
+        )
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = (
+                model_kwargs["encoder_outputs"].get("attentions")
+                if output_attentions
+                else None
+            )
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states")
+                if output_hidden_states
+                else None
+            )
+
+        # keep track of which sequences are already finished
+        batch_size, cur_len = input_ids.shape[:2]
+        this_peer_finished = False
+        unfinished_sequences = torch.ones(
+            batch_size, dtype=torch.long, device=input_ids.device
+        )
+        model_kwargs = self._get_initial_cache_position(
+            cur_len, input_ids.device, model_kwargs
+        )
+
+        model_forward = self.__call__
+        compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
+        if compile_forward:
+            os.environ["TOKENIZERS_PARALLELISM"] = "0"
+            # If we use FA2 and a static cache, we cannot compile with fullgraph
+            model_forward = self.get_compiled_call(generation_config.compile_config)
+
+        if generation_config.prefill_chunk_size is not None:
+            model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
+            is_prefill = False
+        else:
+            is_prefill = True
+
+        while self._has_unfinished_sequences(
+            this_peer_finished, synced_gpus, device=input_ids.device
+        ):
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            if is_prefill:
+                outputs = self(**model_inputs, return_dict=True)
+                is_prefill = False
+            else:
+                outputs = model_forward(**model_inputs, return_dict=True)
+
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                continue
+
+            next_token_logits = outputs.logits[:, -1, :].to(
+                copy=True, dtype=torch.float32, device=input_ids.device
+            )
+
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores,)
+                if output_logits:
+                    raw_logits += (next_token_logits,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # token selection
+            if do_sample:
+                probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(next_token_scores, dim=-1)
+
+            # finished sentences should have their next token be a padding token
+            if has_eos_stopping_criteria:
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
+                    1 - unfinished_sequences
+                )
+
+            # update generated ids, model inputs, and length for next step
+            # PATCHED: dimension issues in generate; NOTE(review): cat expects 2D next_tokens
+            input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+            this_peer_finished = unfinished_sequences.max() == 0
+            cur_len += 1
+            del outputs
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GenerateEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
 
 def patched__compute_dynamic_ntk_parameters(
     config: Optional[transformers.PretrainedConfig] = None,
diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py
index dde656a1..820f2e3f 100644
--- a/onnx_diagnostic/torch_models/hghub/model_inputs.py
+++ b/onnx_diagnostic/torch_models/hghub/model_inputs.py
@@ -57,7 +57,7 @@ def get_untrained_model_with_inputs(
         to get a smaller model
     :param use_pretrained: download the pretrained weights as well
     :param use_preinstalled: use preinstalled configurations
-    :param add_second_input: provides a second inputs to check a model
+    :param add_second_input: provides other inputs to check a model
         supports different shapes
     :param subfolder: subfolder to use for this model id
     :param use_only_preinstalled: use only preinstalled version
diff --git a/onnx_diagnostic/torch_models/validate.py b/onnx_diagnostic/torch_models/validate.py
index 88feb5f6..d2aaf012 100644
--- a/onnx_diagnostic/torch_models/validate.py
+++ b/onnx_diagnostic/torch_models/validate.py
@@ -359,9 +359,10 @@ def validate_model(
         ``orteval10``, ``ref`` only if `do_run` is true
     :param repeat: number of time to measure the model
     :param warmup: warmup the model first
-    :param inputs2: checks that the second set of inputs is reunning as well,
+    :param inputs2: checks that other sets of inputs are running as well,
         this ensures that the model does support dynamism, the value is used
-        as an increment to the first set of values (added to dimensions)
+        as an increment to the first set of values (added to dimensions),
+        or an empty cache for example
     :param output_names: output names the onnx exporter should use
     :param ort_logs: increases onnxruntime verbosity when creating the session
     :return: two dictionaries, one with some metrics,
@@ -391,6 +392,10 @@ def validate_model(
     :class:`onnx_diagnostic.reference.ExtendedReferenceEvaluator`
     if ``runtime == 'ref'``,
     ``orteval10`` increases the verbosity.
+
+    .. versionchanged:: 0.7.13
+        *inputs2* not only means a second set of inputs but also other sets such as
+        ``inputs_empty_cache`` which refers to a set of inputs using an empty cache.
     """
     model_id, subfolder, same_as_pretrained, use_pretrained = _preprocess_model_id(
         model_id,
@@ -505,10 +510,9 @@ def validate_model(
             )
         ),
     )
-    assert not inputs2 or "inputs2" in data, (
-        f"inputs2 is True but second set is missing in data for "
-        f"model id {model_id!r}: {sorted(data)}"
-    )
+
+    second_input_keys = [k for k in data if k.startswith("inputs") and k != "inputs"]
+
     if dump_folder:
         with open(os.path.join(dump_folder, "model_config.txt"), "w") as f:
             f.write(f"model_id: {model_id}\n------\n")
@@ -601,16 +605,14 @@ def validate_model(
         if verbose:
             print(f"[validate_model] new inputs: {string_type(data['inputs'])}")
             print(f"[validate_model] new dynamic_hapes: {string_type(data['dynamic_shapes'])}")
-        if inputs2:
-            assert (
-                "inputs2" in data
-            ), "Cannot test a second set of inputs as it was not defined."
-            data["inputs2"], _ = filter_inputs(
-                data["inputs2"],
-                drop_names=drop_inputs,
-                model=data["model"],
-                dynamic_shapes=data["dynamic_shapes"],
-            )
+        if second_input_keys:
+            for k in second_input_keys:
+                data[k], _ = filter_inputs(
+                    data[k],
+                    drop_names=drop_inputs,
+                    model=data["model"],
+                    dynamic_shapes=data["dynamic_shapes"],
+                )
 
     if not empty(dtype):
         if isinstance(dtype, str):
@@ -620,8 +622,9 @@ def validate_model(
         data["model"] = to_any(data["model"], dtype)  # type: ignore
         data["inputs"] = to_any(data["inputs"], dtype)  # type: ignore
         summary["model_dtype"] = str(dtype)
-        if "inputs2" in data:
-            data["inputs2"] = to_any(data["inputs2"], dtype)  # type: ignore
+        if second_input_keys:
+            for k in second_input_keys:
+                data[k] = to_any(data[k], dtype)  # type: ignore
 
     if not empty(device):
         if verbose:
@@ -629,11 +632,13 @@ def validate_model(
         data["model"] = to_any(data["model"], device)  # type: ignore
         data["inputs"] = to_any(data["inputs"], device)  # type: ignore
         summary["model_device"] = str(device)
-        if "inputs2" in data:
-            data["inputs2"] = to_any(data["inputs2"], device)  # type: ignore
+        if second_input_keys:
+            for k in second_input_keys:
+                data[k] = to_any(data[k], device)  # type: ignore
 
     for k in ["task", "size", "n_weights"]:
         summary[f"model_{k.replace('_','')}"] = data[k]
+    summary["second_input_keys"] = ",".join(second_input_keys)
     summary["model_inputs_options"] = str(input_options or "")
     summary["model_inputs"] = string_type(data["inputs"], with_shape=True)
     summary["model_shapes"] = string_type(data["dynamic_shapes"])
@@ -660,16 +665,26 @@ def validate_model(
             print(f"[validate_model] +INPUT {k}={string_type(v, with_shape=True)}")
         for k, v in data["dynamic_shapes"].items():
             print(f"[validate_model] +SHAPE {k}={string_type(v)}")
+        print(f"[validate_model] second_input_keys={second_input_keys}")
         print("[validate_model] --")
 
     if do_run:
         _validate_do_run_model(
             data, summary, "inputs", "run", "run_expected", verbose, repeat, warmup, quiet
         )
-        if inputs2:
-            _validate_do_run_model(
-                data, summary, "inputs2", "run2", "run_expected2", verbose, 1, 0, quiet
-            )
+        if second_input_keys:
+            for k in second_input_keys:
+                _validate_do_run_model(
+                    data,
+                    summary,
+                    k,
+                    f"run2{k[6:]}",
+                    f"run_expected2{k[6:]}",
+                    verbose,
+                    1,
+                    0,
+                    quiet,
+                )
 
     if exporter:
         print(
@@ -788,7 +803,7 @@ def validate_model(
             runtime=runtime,
             repeat=repeat,
             warmup=warmup,
-            inputs2=inputs2,
+            second_input_keys=second_input_keys,
             ort_logs=ort_logs,
         )
         summary.update(summary_valid)
@@ -1232,7 +1247,7 @@ def validate_onnx_model(
     runtime: str = "onnxruntime",
     repeat: int = 1,
     warmup: int = 0,
-    inputs2: int = 1,
+    second_input_keys: Optional[List[str]] = None,
     ort_logs: bool = False,
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """
@@ -1249,7 +1264,7 @@ def validate_onnx_model(
     :param runtime: onnx runtime to use, onnxruntime, torch, orteval, ref
     :param repeat: run that number of times the model
     :param warmup: warmup the model
-    :param inputs2: to validate the model on the second input set
+    :param second_input_keys: to validate the model on other input sets
         to make sure the exported model supports dynamism, the value is
         used as an increment added to the first set of inputs (added to dimensions)
     :param ort_logs: triggers the logs for onnxruntime
@@ -1374,10 +1389,12 @@ def _mk(key, flavour=flavour):
         print(f"[validate_onnx_model] done (ort_session) flavour={flavour!r}")
 
     keys = [("inputs", "run_expected", "")]
-    if inputs2:
-        keys.append(("inputs2", "run_expected2", "2"))
+    if second_input_keys:
+        keys.extend([(k, f"run_expected2{k[6:]}", f"2{k[6:]}") for k in second_input_keys])
     for k_input, k_expected, suffix in keys:
         # make_feeds
+        assert k_input in data, f"Unable to find {k_input!r} in {sorted(data)}"
+        assert k_expected in data, f"Unable to find {k_expected!r} in {sorted(data)}"
         if verbose:
             print(f"[validate_onnx_model] -- make_feeds for {k_input!r}...")
             print(

From 05b06e9e385338231615eb7d00efa45ef729da59 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 3 Oct 2025 15:30:48 +0200
Subject: [PATCH 2/4] patches

---
 .github/workflows/check-urls.yml                   |  4 ++--
 .github/workflows/ci.yml                           | 14 ++++++--------
 CHANGELOGS.rst                                     |  1 +
 _doc/patches.rst                                   |  5 ++++-
 _doc/status/patches_coverage.rst                   |  5 ++++-
 .../torch_export_patches/onnx_export_errors.py     |  2 +-
 onnx_diagnostic/torch_models/validate.py           |  2 +-
 7 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml
index 49c2bfbb..faf0ea99 100644
--- a/.github/workflows/check-urls.yml
+++ b/.github/workflows/check-urls.yml
@@ -42,6 +42,6 @@ jobs:
         print_all: false
         timeout: 2
         retry_count# : 2
-        exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311
-        exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/,https://codecov.io/,https://huggingface.co/
+        exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311,https://www.linux.org/
+        exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/,https://codecov.io/,https://huggingface.co/,https://www.linux.org/
         # force_pass : true
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 20f4fdfd..b89177a9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python: ['3.10', '3.11', '3.12', '3.13']
-        transformers: ['4.48.3', '4.51.3', '4.52.4', '4.53.3', '4.55.4', '4.56.1', 'main']
+        transformers: ['4.48.3', '4.51.3', '4.52.4', '4.55.4', '4.56.2', 'main']
         torch: ['2.8', 'main']
         exclude:
           - python: '3.10'
@@ -26,22 +26,18 @@ jobs:
             transformers: 'main'
           - python: '3.10'
             transformers: '4.52.4'
-          - python: '3.10'
-            transformers: '4.53.3'
           - python: '3.10'
             transformers: '4.55.4'
           - python: '3.10'
-            transformers: '4.56.1'
+            transformers: '4.56.2'
           - python: '3.11'
             torch: 'main'
-          - python: '3.11'
-            transformers: '4.53.3'
           - python: '3.11'
             transformers: 'main'
           - python: '3.11'
             transformers: '4.55.4'
           - python: '3.11'
-            transformers: '4.56.1'
+            transformers: '4.56.2'
           - python: '3.13'
             torch: '2.8'
           - python: '3.13'
@@ -49,7 +45,9 @@ jobs:
           - python: '3.13'
             transformers: '4.51.3'
           - python: '3.13'
-            transformers: '4.52.4'
+            transformers: '4.55.4'
+          - python: '3.13'
+            transformers: '4.56.2'
     steps:
       - uses: actions/checkout@v3
 
diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index dd498636..d8494f05 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.7.13
 ++++++
 
+* :pr:`256`: add a set of inputs checking models works for an empty cache on task text-generation
 * :pr:`244`: add a patch to bypass the exception raised when the dynamic dimension is in {0,1}
 
 0.7.12
diff --git a/_doc/patches.rst b/_doc/patches.rst
index d174c3df..a397d11c 100644
--- a/_doc/patches.rst
+++ b/_doc/patches.rst
@@ -91,7 +91,10 @@ Here is the list of available patches:
 
     for name, cls in p.__dict__.items():
         if name.startswith("patched_") and hasattr(cls, "_PATCHES_"):
-            print(f"{cls._PATCHED_CLASS_.__name__}: {', '.join(cls._PATCHES_)}")
+            print(
+                f"{cls._PATCHED_CLASS_.__name__}: "
+                f"{', '.join([_ for _ in cls._PATCHES_ if _ is not None])}"
+            )
 
 Cache serialization
 ===================
diff --git a/_doc/status/patches_coverage.rst b/_doc/status/patches_coverage.rst
index 61d5b775..c2cdc37f 100644
--- a/_doc/status/patches_coverage.rst
+++ b/_doc/status/patches_coverage.rst
@@ -32,7 +32,10 @@ for transformers.
 
     for name, cls in p.__dict__.items():
         if name.startswith("patched_") and hasattr(cls, "_PATCHES_"):
-            print(f"{cls._PATCHED_CLASS_.__name__}: {', '.join(cls._PATCHES_)}")
+            print(
+                f"{cls._PATCHED_CLASS_.__name__}: "
+                f"{', '.join([_ for _ in cls._PATCHES_ if _ is not None])}"
+            )
 
 Half Automated Rewrites for Control Flows
 =========================================
diff --git a/onnx_diagnostic/torch_export_patches/onnx_export_errors.py b/onnx_diagnostic/torch_export_patches/onnx_export_errors.py
index 210e98e8..10bb006e 100644
--- a/onnx_diagnostic/torch_export_patches/onnx_export_errors.py
+++ b/onnx_diagnostic/torch_export_patches/onnx_export_errors.py
@@ -83,7 +83,7 @@ def patch_module_or_classes(mod, verbose: int = 0) -> Dict[type, Dict[type, Call
             continue
 
         original = cls._PATCHED_CLASS_
-        methods = cls._PATCHES_
+        methods = [_ for _ in cls._PATCHES_ if _ is not None]
         if verbose:
             print(f"[patch_module_or_classes] {name}.{cls.__name__}: {', '.join(methods)}")
 
diff --git a/onnx_diagnostic/torch_models/validate.py b/onnx_diagnostic/torch_models/validate.py
index d2aaf012..90d3f36b 100644
--- a/onnx_diagnostic/torch_models/validate.py
+++ b/onnx_diagnostic/torch_models/validate.py
@@ -865,7 +865,7 @@ def validate_model(
                     runtime=runtime,
                     repeat=repeat,
                     warmup=warmup,
-                    inputs2=inputs2,
+                    second_input_keys=second_input_keys,
                 )
                 summary.update(summary_valid)
 

From 2be018c6f96735223e1b4a3897014d0be6c5553c Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 3 Oct 2025 17:09:05 +0200
Subject: [PATCH 3/4] fix

---
 CHANGELOGS.rst                           | 3 ++-
 _unittests/ut_tasks/test_tasks.py        | 7 ++++---
 onnx_diagnostic/torch_models/validate.py | 3 ++-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index c2abc44a..bf6e231c 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,7 +4,8 @@ Change Logs
 0.7.13
 ++++++
 
-* :pr:`256`: add a set of inputs checking models works for an empty cache on task text-generation
+* :pr:`247`: supports more gemma models with ModelBuilder
+* :pr:`246`: add a set of inputs checking that models work with an empty cache for task text-generation
 * :pr:`237`: dummy inputs for google/gemma-3-4b-it
 * :pr:`244`: add a patch to bypass the exception raised when the dynamic dimension is in {0,1}
 
diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py
index 1bb71bff..975372ec 100644
--- a/_unittests/ut_tasks/test_tasks.py
+++ b/_unittests/ut_tasks/test_tasks.py
@@ -54,7 +54,8 @@ def test_text_generation_empty_cache(self):
         model, inputs = data["model"], data["inputs"]
         self.assertIn("inputs_empty_cache", data)
         empty_inputs = torch_deepcopy(data["inputs_empty_cache"])
-        expected = model(**empty_inputs)
+        model(**torch_deepcopy(empty_inputs))
+        expected = model(**torch_deepcopy(inputs))
         self.assertEqual(
             {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)
         )
@@ -62,10 +63,10 @@ def test_text_generation_empty_cache(self):
             ep = torch.export.export(
                 model,
                 (),
-                kwargs=inputs,
+                kwargs=torch_deepcopy(inputs),
                 dynamic_shapes=use_dyn_not_str(data["dynamic_shapes"]),
             )
-            got = ep.module()(**inputs)
+            got = ep.module()(**torch_deepcopy(inputs))
             self.assertEqualArrayAny(expected, got)
 
     @hide_stdout()
diff --git a/onnx_diagnostic/torch_models/validate.py b/onnx_diagnostic/torch_models/validate.py
index 22cbf117..719f3828 100644
--- a/onnx_diagnostic/torch_models/validate.py
+++ b/onnx_diagnostic/torch_models/validate.py
@@ -415,7 +415,8 @@ def validate_model(
     ``orteval10`` increases the verbosity.
 
     .. versionchanged:: 0.7.13
-        *inputs2* not only means a second set of inputs such as ``input_empty_cache``
+        *inputs2* does not only mean a single second set of inputs but possibly many,
+        such as ``inputs_empty_cache``,
         which refers to a set of inputs using an empty cache.
     """
     validation_begin = time.perf_counter()

From 6a41c09cd8925c65a12e57067e7b325a850d5cc1 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 3 Oct 2025 17:35:18 +0200
Subject: [PATCH 4/4] fix

---
 _unittests/ut_torch_models/test_validate_whole_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_unittests/ut_torch_models/test_validate_whole_models.py b/_unittests/ut_torch_models/test_validate_whole_models.py
index a29be5cf..830ee0b2 100644
--- a/_unittests/ut_torch_models/test_validate_whole_models.py
+++ b/_unittests/ut_torch_models/test_validate_whole_models.py
@@ -227,7 +227,7 @@ def test_m_validate_model_vit_model(self):
         self.assertIsInstance(summary, dict)
         self.assertIsInstance(data, dict)
         self.assertLess(summary["disc_onnx_ort_run_abs"], 1e-3)
-        self.assertLess(summary["disc_onnx_ort_run2_abs"], 1e-3)
+        self.assertLess(summary["disc_onnx_ort_run22_abs"], 1e-3)
         self.assertEqual("dict(pixel_values:A1s2x3x30x30)", summary["run_feeds_inputs"])
         self.assertEqual("dict(pixel_values:A1s3x3x31x31)", summary["run_feeds_inputs2"])
         self.assertEqual("#1[A1s2x2]", summary["run_output_inputs"])