Add set of inputs for empty cache (#246)

xadupre · web-flow · commit d8e0dd848421 · 2025-10-03T17:56:47.000+02:00
* Add set of inputs for empty cache

* patches

* fix

* fix
diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml
@@ -42,6 +42,6 @@ jobs:
         print_all: false
         timeout: 2
         retry_count# : 2
-        exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311
-        exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/,https://codecov.io/,https://huggingface.co/
+        exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311,https://www.linux.org/
+        exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/,https://codecov.io/,https://huggingface.co/,https://www.linux.org/
         # force_pass : true
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python: ['3.10', '3.11', '3.12', '3.13']
-        transformers: ['4.48.3', '4.51.3', '4.52.4', '4.53.3', '4.55.4', '4.56.1', 'main']
+        transformers: ['4.48.3', '4.51.3', '4.52.4', '4.55.4', '4.56.2', 'main']
         torch: ['2.8', 'main']
         exclude:
           - python: '3.10'
@@ -26,30 +26,28 @@ jobs:
             transformers: 'main'
           - python: '3.10'
             transformers: '4.52.4'
-          - python: '3.10'
-            transformers: '4.53.3'
           - python: '3.10'
             transformers: '4.55.4'
           - python: '3.10'
-            transformers: '4.56.1'
+            transformers: '4.56.2'
           - python: '3.11'
             torch: 'main'
-          - python: '3.11'
-            transformers: '4.53.3'
           - python: '3.11'
             transformers: 'main'
           - python: '3.11'
             transformers: '4.55.4'
           - python: '3.11'
-            transformers: '4.56.1'
+            transformers: '4.56.2'
           - python: '3.13'
             torch: '2.8'
           - python: '3.13'
             transformers: '4.48.3'
           - python: '3.13'
             transformers: '4.51.3'
           - python: '3.13'
-            transformers: '4.52.4'
+            transformers: '4.55.4'
+          - python: '3.13'
+            transformers: '4.56.2'
     steps:
       - uses: actions/checkout@v3
 
diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
@@ -4,6 +4,8 @@ Change Logs
 0.7.13
 ++++++
 
+* :pr:`247`: supports more gemma models with ModelBuilder
+* :pr:`246`: add a set of inputs checking that a model works with an empty cache for task text-generation
 * :pr:`237`: dummy inputs for google/gemma-3-4b-it
 * :pr:`244`: add a patch to bypass the exception raised when the dynamic dimension is in {0,1}
 
diff --git a/_doc/patches.rst b/_doc/patches.rst
@@ -91,7 +91,10 @@ Here is the list of available patches:
 
     for name, cls in p.__dict__.items():
         if name.startswith("patched_") and hasattr(cls, "_PATCHES_"):
-            print(f"{cls._PATCHED_CLASS_.__name__}: {', '.join(cls._PATCHES_)}")
+            print(
+                f"{cls._PATCHED_CLASS_.__name__}: "
+                f"{', '.join([_ for _ in cls._PATCHES_ if _ is not None])}"
+            )
 
 Cache serialization
 ===================
diff --git a/_doc/status/patches_coverage.rst b/_doc/status/patches_coverage.rst
@@ -32,7 +32,10 @@ for transformers.
 
     for name, cls in p.__dict__.items():
         if name.startswith("patched_") and hasattr(cls, "_PATCHES_"):
-            print(f"{cls._PATCHED_CLASS_.__name__}: {', '.join(cls._PATCHES_)}")
+            print(
+                f"{cls._PATCHED_CLASS_.__name__}: "
+                f"{', '.join([_ for _ in cls._PATCHES_ if _ is not None])}"
+            )
 
 Half Automated Rewrites for Control Flows
 =========================================
diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py
@@ -48,6 +48,27 @@ def test_text_generation(self):
                 model, (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds), strict=False
             )
 
+    def test_text_generation_empty_cache(self):
+        mid = "arnir0/Tiny-LLM"
+        data = get_untrained_model_with_inputs(mid, add_second_input=True)
+        model, inputs = data["model"], data["inputs"]
+        self.assertIn("inputs_empty_cache", data)  # the extra input set must be present
+        empty_inputs = torch_deepcopy(data["inputs_empty_cache"])
+        model(**torch_deepcopy(empty_inputs))  # result discarded: only checks the call runs
+        expected = model(**torch_deepcopy(inputs))  # eager output used as the reference
+        self.assertEqual(
+            {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)
+        )
+        with torch_export_patches(patch_transformers=True, verbose=1):
+            ep = torch.export.export(
+                model,
+                (),
+                kwargs=torch_deepcopy(inputs),
+                dynamic_shapes=use_dyn_not_str(data["dynamic_shapes"]),
+            )
+            got = ep.module()(**torch_deepcopy(inputs))
+            self.assertEqualArrayAny(expected, got)  # exported module vs eager output
+
     @hide_stdout()
     def test_automatic_speech_recognition_float32(self):
         mid = "openai/whisper-tiny"
diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py
@@ -5,8 +5,8 @@
 from onnx_diagnostic.helpers import string_type
 from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache
 from onnx_diagnostic.helpers.torch_helper import steal_forward
-from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
 from onnx_diagnostic.torch_export_patches import torch_export_patches
+from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
 
 
 class TestHuggingFaceHubModel(ExtTestCase):
@@ -132,6 +132,52 @@ def test_text2text_generation_static(self):
             )
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
 
+    @never_test()
+    def test_text_generation_tiny_llm(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k tiny_llm
+        """
+        dict(cache_position:T7s21,
+             past_key_values:DynamicCache(key_cache=#0[], value_cache=#0[]),
+             input_ids:T7s1x21,
+             position_ids:T7s1x21
+             attention_mask:T1s1x21)
+        dict(cache_position:T7s1,
+             past_key_values:DynamicCache(key_cache=#32[T1s1x8x21x128,...],
+                                          value_cache=#32[T1s1x8x21x128,...]),
+             input_ids:T7s1x21,
+             position_ids:T7s1x1
+             attention_mask:T1s1x1)
+        """
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        tokenizer = AutoTokenizer.from_pretrained("arnir0/Tiny-LLM")
+        model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-mini-instruct")
+
+        text = "def greet(user): print(f'hello <extra_id_0>!')"
+        input_ids = tokenizer(text, return_tensors="pt").input_ids.reshape((1, -1))
+        mask = (
+            torch.tensor([1 for i in range(input_ids.shape[1])])
+            .to(torch.int64)
+            .reshape((1, -1))
+        )
+        position_ids = torch.arange(input_ids.shape[1], dtype=torch.int64).reshape((1, -1))
+
+        # simply generate a single sequence
+        print()
+        with (
+            torch_export_patches(
+                patch_transformers=True, patch_torch=False, patch_sympy=False
+            ),
+            steal_forward(model),
+        ):
+            generated_ids = model.generate(
+                input_ids=input_ids,
+                max_length=100,
+                attention_mask=mask,
+                position_ids=position_ids,
+            )
+        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
+
     @never_test()
     def test_text_generation_phi4_mini(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_mini
diff --git a/_unittests/ut_torch_models/test_validate_whole_models.py b/_unittests/ut_torch_models/test_validate_whole_models.py
@@ -227,7 +227,7 @@ def test_m_validate_model_vit_model(self):
         self.assertIsInstance(summary, dict)
         self.assertIsInstance(data, dict)
         self.assertLess(summary["disc_onnx_ort_run_abs"], 1e-3)
-        self.assertLess(summary["disc_onnx_ort_run2_abs"], 1e-3)
+        self.assertLess(summary["disc_onnx_ort_run22_abs"], 1e-3)
         self.assertEqual("dict(pixel_values:A1s2x3x30x30)", summary["run_feeds_inputs"])
         self.assertEqual("dict(pixel_values:A1s3x3x31x31)", summary["run_feeds_inputs2"])
         self.assertEqual("#1[A1s2x2]", summary["run_output_inputs"])
diff --git a/onnx_diagnostic/helpers/torch_helper.py b/onnx_diagnostic/helpers/torch_helper.py
@@ -779,7 +779,12 @@ def to_any(value: Any, to_value: Union[torch.dtype, torch.device, str]) -> Any:
 
 
 def torch_deepcopy(value: Any) -> Any:
-    """Makes a deepcopy."""
+    """
+    Makes a deep copy.
+
+    :param value: any value
+    :return: a deep copy
+    """
     if value is None:
         return None
     if isinstance(value, (int, float, str)):
diff --git a/onnx_diagnostic/tasks/text_generation.py b/onnx_diagnostic/tasks/text_generation.py
@@ -269,6 +269,21 @@ def get_inputs(
             add_second_input=0,
             **kwargs,
         )["inputs"]
+        res["inputs_empty_cache"] = get_inputs(
+            model=model,
+            config=config,
+            dummy_max_token_id=dummy_max_token_id,
+            num_hidden_layers=num_hidden_layers,
+            batch_size=batch_size,
+            sequence_length=0,
+            sequence_length2=sequence_length2,
+            dynamic_rope=dynamic_rope,
+            num_key_value_heads=num_key_value_heads,
+            head_dim=head_dim,
+            cls_cache=cls_cache,
+            add_second_input=0,
+            **kwargs,
+        )["inputs"]
     return res
 
 
diff --git a/onnx_diagnostic/torch_export_patches/onnx_export_errors.py b/onnx_diagnostic/torch_export_patches/onnx_export_errors.py
@@ -83,7 +83,7 @@ def patch_module_or_classes(mod, verbose: int = 0) -> Dict[type, Dict[type, Call
             continue
 
         original = cls._PATCHED_CLASS_
-        methods = cls._PATCHES_
+        methods = [_ for _ in cls._PATCHES_ if _ is not None]
         if verbose:
             print(f"[patch_module_or_classes] {name}.{cls.__name__}: {', '.join(methods)}")
 
diff --git a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
@@ -9,6 +9,12 @@
 import transformers
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.cache_utils import StaticCache, Cache
+from transformers.generation.utils import (
+    GenerateNonBeamOutput,
+    GenerationConfig,
+    StoppingCriteriaList,
+    LogitsProcessorList,
+)
 
 try:
     from transformers.cache_utils import parse_processor_args  # noqa: F401
@@ -459,18 +465,18 @@ class patched_GenerationMixin:
     """
 
     _PATCHES_ = [
-        name
-        for name in [
-            "_cache_dependant_input_preparation",
-            "_cache_dependant_input_preparation_exporting",
-            (
-                None
-                if pv.Version(transformers.__version__) >= pv.Version("4.56")
-                else "prepare_inputs_for_generation"
-            ),
-            "_sample",
-        ]
-        if name is not None
+        "_cache_dependant_input_preparation",
+        "_cache_dependant_input_preparation_exporting",
+        (
+            None
+            if pv.Version(transformers.__version__) >= pv.Version("4.56")
+            else "prepare_inputs_for_generation"
+        ),
+        (
+            "_sample"
+            if pv.Version(transformers.__version__) == pv.Version("4.57.0.dev0")
+            else None
+        ),
     ]
     _PATCHED_CLASS_ = transformers.generation.utils.GenerationMixin
 
@@ -603,7 +609,7 @@ def prepare_inputs_for_generation(
         model_inputs = {}
         # - some models don't have `Cache` support
         # (which implies they don't expect `cache_position` in `forward`)
-        if self._supports_cache_class:
+        if getattr(self, "_supports_cache_class", False):
             model_inputs["cache_position"] = cache_position
         # - `cache_position` was not a mandatory input in
         # `prepare_inputs_for_generation` for those models, and this
@@ -832,8 +838,6 @@ def _sample(
             else:
                 outputs = model_forward(**model_inputs, return_dict=True)
 
-            # synced_gpus: don't waste resources running the code we don't need;
-            # kwargs must be updated before skipping
             model_kwargs = self._update_model_kwargs_for_generation(
                 outputs,
                 model_kwargs,
@@ -842,9 +846,6 @@ def _sample(
             if synced_gpus and this_peer_finished:
                 continue
 
-            # Copy is needed to avoid keeping a hanging ref to outputs.logits
-            # which may be very large for first iteration
-            # (the clone itself is always small)
             next_token_logits = outputs.logits[:, -1, :].to(
                 copy=True, dtype=torch.float32, device=input_ids.device
             )
diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py
@@ -57,7 +57,7 @@ def get_untrained_model_with_inputs(
         to get a smaller model
     :param use_pretrained: download the pretrained weights as well
     :param use_preinstalled: use preinstalled configurations
-    :param add_second_input: provides a second inputs to check a model
+    :param add_second_input: provides other inputs to check a model
         supports different shapes
     :param subfolder: subfolder to use for this model id
     :param use_only_preinstalled: use only preinstalled version
diff --git a/onnx_diagnostic/torch_models/validate.py b/onnx_diagnostic/torch_models/validate.py