diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index a1521060..c4445b17 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -31,7 +31,7 @@ jobs: timeout: 2 retry_count# : 2 exclude_urls: https://github.com/pytorch/pytorch/pull/117009,https://github.com/huggingface/transformers/pull/29285,https://github.com/pytorch/pytorch/blob/a44f8894fa6d973693aab44a3dda079a168b05c1/torch/_decomp/decompositions.py#L1475 - exclude_patterns: https://dumps.wikimedia.org/,https://github.com/pytorch/pytorch/pull/,https://github.com/pytorch/pytorch/blob/a44f8894fa6d973693aab44a3dda079a168b05c1/torch/_decomp/decompositions.py#L1475,https://huggingface.co/ + exclude_patterns: https://dumps.wikimedia.org/,https://github.com/pytorch/pytorch/pull/,https://github.com/pytorch/pytorch/blob/a44f8894fa6d973693aab44a3dda079a168b05c1/torch/_decomp/decompositions.py#L1475,https://huggingface.co/,https://huggingface.co/ # force_pass : true - name: urls-checker-docs @@ -43,5 +43,5 @@ jobs: timeout: 2 retry_count# : 2 exclude_urls: https://hal.archives-,ouvertes.fr/hal-00990252/document,http://badge.fury.io/py/onnx-diagnostic,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://medium.com/@msouza.os/llm-from-scratch-with-pytorch-9f21808c6319,https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L5965,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311 - exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311,https://codecov.io/ + exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/,https://azure.microsoft.com/en-us/products/devops/pipelines,https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670,https://github.com/NVIDIA/TransformerEngine.git@6a9edc38bf9b941b7d369af5103fa8fe0b121d61,https://github.com/pytorch/pytorch/blob/main/torch/,https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html,https://badge.fury.io/py/onnx-diagnostic.svg,https://github.com/huggingface/transformers/pull/36311,https://codecov.io/,https://huggingface.co/ # force_pass : true diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index b461325b..b7abaa86 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -62,9 +62,11 @@ jobs: - name: tiny-llm example run: PYTHONPATH=. python _doc/examples/plot_export_tiny_llm.py + continue-on-error: true # connectivity issues - name: tiny-llm bypass run: PYTHONPATH=. python _doc/examples/plot_export_tiny_llm_patched.py + continue-on-error: true # connectivity issues - name: run tests bypassed run: PYTHONPATH=. 
python _unittests/ut_torch_models/test_tiny_llms_bypassed.py diff --git a/_doc/api/helpers/config_helper.rst b/_doc/api/helpers/config_helper.rst new file mode 100644 index 00000000..6dd9be71 --- /dev/null +++ b/_doc/api/helpers/config_helper.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.helpers.config_helper +===================================== + +.. automodule:: onnx_diagnostic.helpers.config_helper + :members: + :no-undoc-members: diff --git a/_doc/api/helpers/index.rst b/_doc/api/helpers/index.rst index 79703adf..d3224eb6 100644 --- a/_doc/api/helpers/index.rst +++ b/_doc/api/helpers/index.rst @@ -9,6 +9,7 @@ onnx_diagnostic.helpers args_helper bench_run cache_helper + config_helper helper memory_peak onnx_helper diff --git a/_doc/api/index.rst b/_doc/api/index.rst index 2de84cf2..aa5e7b97 100644 --- a/_doc/api/index.rst +++ b/_doc/api/index.rst @@ -10,6 +10,7 @@ API of onnx_diagnostic export/index helpers/index reference/index + tasks/index torch_export_patches/index torch_models/index torch_onnx/index diff --git a/_doc/api/tasks/automatic_speech_recognition.rst b/_doc/api/tasks/automatic_speech_recognition.rst new file mode 100644 index 00000000..5c3f64ea --- /dev/null +++ b/_doc/api/tasks/automatic_speech_recognition.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.automatic_speech_recognition +=================================================== + +.. automodule:: onnx_diagnostic.tasks.automatic_speech_recognition + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/fill_mask.rst b/_doc/api/tasks/fill_mask.rst new file mode 100644 index 00000000..58c6402a --- /dev/null +++ b/_doc/api/tasks/fill_mask.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.fill_mask +================================ + +.. automodule:: onnx_diagnostic.tasks.fill_mask + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/image_classification.rst b/_doc/api/tasks/image_classification.rst new file mode 100644 index 00000000..3643b2f5 --- /dev/null +++ b/_doc/api/tasks/image_classification.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.image_classification +=========================================== + +.. automodule:: onnx_diagnostic.tasks.image_classification + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/image_text_to_text.rst b/_doc/api/tasks/image_text_to_text.rst new file mode 100644 index 00000000..abe80bd8 --- /dev/null +++ b/_doc/api/tasks/image_text_to_text.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.image_text_to_text +========================================= + +.. automodule:: onnx_diagnostic.tasks.image_text_to_text + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/index.rst b/_doc/api/tasks/index.rst new file mode 100644 index 00000000..ae7e14b3 --- /dev/null +++ b/_doc/api/tasks/index.rst @@ -0,0 +1,18 @@ +onnx_diagnostic.tasks +===================== + +.. toctree:: + :maxdepth: 1 + :caption: modules + + automatic_speech_recognition + fill_mask + image_classification + image_text_to_text + text_generation + text2text_generation + zero_shot_image_classification + +.. automodule:: onnx_diagnostic.tasks + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/text2text_generation.rst b/_doc/api/tasks/text2text_generation.rst new file mode 100644 index 00000000..c148d174 --- /dev/null +++ b/_doc/api/tasks/text2text_generation.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.text2text_generation +=========================================== + +.. 
automodule:: onnx_diagnostic.tasks.text2text_generation + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/text_generation.rst b/_doc/api/tasks/text_generation.rst new file mode 100644 index 00000000..3f125381 --- /dev/null +++ b/_doc/api/tasks/text_generation.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.text_generation +====================================== + +.. automodule:: onnx_diagnostic.tasks.text_generation + :members: + :no-undoc-members: diff --git a/_doc/api/tasks/zero_shot_image_classification.rst b/_doc/api/tasks/zero_shot_image_classification.rst new file mode 100644 index 00000000..74d9e619 --- /dev/null +++ b/_doc/api/tasks/zero_shot_image_classification.rst @@ -0,0 +1,7 @@ + +onnx_diagnostic.export.zero_shot_image_classification +===================================================== + +.. automodule:: onnx_diagnostic.tasks.zero_shot_image_classification + :members: + :no-undoc-members: diff --git a/_doc/conf.py b/_doc/conf.py index c1c3e749..0545e77e 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -170,7 +170,9 @@ } if int(os.environ.get("UNITTEST_GOING", "0")): - sphinx_gallery_conf["ignore_pattern"] = ".*((tiny_llm)|(dort)|(draft_mode)).*" + sphinx_gallery_conf["ignore_pattern"] = ( + ".*((tiny_llm)|(dort)|(draft_mode)|(hub_codellama.py)).*" + ) elif pv.Version(torch.__version__) < pv.Version("2.8"): sphinx_gallery_conf["ignore_pattern"] = ".*((_oe_)|(dort)|(draft_mode)).*" diff --git a/_unittests/ut_helpers/test_config_helper.py b/_unittests/ut_helpers/test_config_helper.py new file mode 100644 index 00000000..1ad7a9ad --- /dev/null +++ b/_unittests/ut_helpers/test_config_helper.py @@ -0,0 +1,20 @@ +import unittest +import transformers +from onnx_diagnostic.ext_test_case import ( + ExtTestCase, + requires_torch, + requires_transformers, +) +from onnx_diagnostic.helpers.config_helper import config_class_from_architecture + + +class TestConfigHelper(ExtTestCase): + @requires_transformers("4.50") # we limit to some versions of the CI + @requires_torch("2.7") + def test_config_class_from_architecture(self): + config = config_class_from_architecture("LlamaForCausalLM") + self.assertEqual(config, transformers.LlamaConfig) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_tasks/test_tasks.py b/_unittests/ut_tasks/test_tasks.py new file mode 100644 index 00000000..e52a0697 --- /dev/null +++ b/_unittests/ut_tasks/test_tasks.py @@ -0,0 +1,105 @@ +import unittest +import torch +from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout +from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs +from onnx_diagnostic.torch_export_patches import bypass_export_some_errors + + +class TestTasks(ExtTestCase): + @hide_stdout() + def test_text2text_generation(self): + mid = "sshleifer/tiny-marian-en-de" + # mid = "Salesforce/codet5-small" + data = get_untrained_model_with_inputs(mid, verbose=1) + self.assertIn((data["size"], data["n_weights"]), [(473928, 118482)]) + model, inputs = data["model"], data["inputs"] + raise unittest.SkipTest(f"not working for {mid!r}") + model(**inputs) + + @hide_stdout() + def test_automatic_speech_recognition(self): + mid = "openai/whisper-tiny" + data = get_untrained_model_with_inputs(mid, verbose=1) + self.assertIn((data["size"], data["n_weights"]), [(132115968, 33028992)]) + model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"] + Dim = torch.export.Dim + self.maxDiff = None + self.assertIn("{0:Dim(batch),1:Dim(seq_length)}", self.string_type(ds)) 
+ self.assertEqualAny( + { + "decoder_input_ids": { + 0: Dim("batch", min=1, max=1024), + 1: Dim("seq_length", min=1, max=4096), + }, + "cache_position": {0: Dim("seq_length", min=1, max=4096)}, + "encoder_outputs": [{0: Dim("batch", min=1, max=1024)}], + "past_key_values": [ + [ + [ + {0: Dim("batch", min=1, max=1024)}, + {0: Dim("batch", min=1, max=1024)}, + ], + [ + {0: Dim("batch", min=1, max=1024)}, + {0: Dim("batch", min=1, max=1024)}, + ], + ], + [ + [ + {0: Dim("batch", min=1, max=1024)}, + {0: Dim("batch", min=1, max=1024)}, + ], + [ + {0: Dim("batch", min=1, max=1024)}, + {0: Dim("batch", min=1, max=1024)}, + ], + ], + ], + }, + ds, + ) + model(**inputs) + self.assertEqual( + "#1[T1r3]", + self.string_type(torch.utils._pytree.tree_flatten(inputs["encoder_outputs"])[0]), + ) + with bypass_export_some_errors(patch_transformers=True, verbose=10): + flat = torch.utils._pytree.tree_flatten(inputs["past_key_values"])[0] + self.assertIsInstance(flat, list) + self.assertIsInstance(flat[0], torch.Tensor) + self.assertEqual( + "#8[T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4]", + self.string_type(flat), + ) + torch.export.export(model, (), kwargs=inputs, dynamic_shapes=ds, strict=False) + with bypass_export_some_errors(patch_transformers=True, verbose=10): + flat = torch.utils._pytree.tree_flatten(inputs["past_key_values"])[0] + self.assertIsInstance(flat, list) + self.assertIsInstance(flat[0], torch.Tensor) + self.assertEqual( + "#8[T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4]", + self.string_type(flat), + ) + torch.export.export(model, (), kwargs=inputs, dynamic_shapes=ds, strict=False) + + @hide_stdout() + def test_imagetext2text_generation(self): + mid = "HuggingFaceM4/tiny-random-idefics" + # mid = "Salesforce/codet5-small" + data = get_untrained_model_with_inputs(mid, verbose=1) + self.assertIn((data["size"], data["n_weights"]), [(12742888, 3185722)]) + model, inputs = data["model"], data["inputs"] + model(**inputs) + + @hide_stdout() + def test_fill_mask(self): + mid = "google-bert/bert-base-multilingual-cased" + # mid = "Salesforce/codet5-small" + data = get_untrained_model_with_inputs(mid, verbose=1) + self.assertIn((data["size"], data["n_weights"]), [(428383212, 107095803)]) + model, inputs = data["model"], data["inputs"] + model(**inputs) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/_unittests/ut_torch_models/try_tasks.py b/_unittests/ut_tasks/try_tasks.py similarity index 86% rename from _unittests/ut_torch_models/try_tasks.py rename to _unittests/ut_tasks/try_tasks.py index d0fdf22e..e05a2161 100644 --- a/_unittests/ut_torch_models/try_tasks.py +++ b/_unittests/ut_tasks/try_tasks.py @@ -7,7 +7,7 @@ class TestHuggingFaceHubModel(ExtTestCase): @never_test() def test_image_classification(self): - # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k image_c + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k image_c from transformers import ViTImageProcessor, ViTModel from PIL import Image @@ -27,7 +27,7 @@ def test_image_classification(self): @never_test() def test_image_classification_resnet(self): - # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k resnet + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k resnet from transformers import ViTImageProcessor, ViTModel from PIL import Image @@ -47,7 +47,7 @@ def test_image_classification_resnet(self): @never_test() def test_zero_shot_image_classification(self): - # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k zero + # 
clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k zero from PIL import Image import requests from transformers import CLIPProcessor, CLIPModel @@ -74,7 +74,7 @@ def test_zero_shot_image_classification(self): @never_test() def test_text2text_generation(self): - # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k text2t + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k text2t import torch from transformers import RobertaTokenizer, T5ForConditionalGeneration @@ -100,7 +100,7 @@ def test_text2text_generation(self): @never_test() def test_imagetext2text_generation(self): - # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k etext2t + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k etext2t # https://huggingface.co/docs/transformers/main/en/tasks/idefics import torch @@ -131,7 +131,7 @@ def test_imagetext2text_generation(self): @never_test() def test_automatic_speech_recognition(self): - # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k automatic_speech + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k automatic_speech # https://huggingface.co/openai/whisper-tiny from transformers import WhisperProcessor, WhisperForConditionalGeneration @@ -195,6 +195,22 @@ def test_automatic_speech_recognition(self): transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) print("--", transcription) + @never_test() + def test_fill_mask(self): + # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k fill + # https://huggingface.co/google-bert/bert-base-multilingual-cased + + from transformers import BertTokenizer, BertModel + + tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased") + model = BertModel.from_pretrained("bert-base-multilingual-cased") + text = "Replace me by any text you'd like." 
+ encoded_input = tokenizer(text, return_tensors="pt") + print() + print("-- inputs", string_type(encoded_input, with_shape=True, with_min_max=True)) + output = model(**encoded_input) + print("-- outputs", string_type(output, with_shape=True, with_min_max=True)) + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/_unittests/ut_torch_models/test_hghub_api.py b/_unittests/ut_torch_models/test_hghub_api.py index bb418e95..3be120fb 100644 --- a/_unittests/ut_torch_models/test_hghub_api.py +++ b/_unittests/ut_torch_models/test_hghub_api.py @@ -100,7 +100,7 @@ def test_hf_all_models(self): def test_load_architecture_task(self): data = load_architecture_task() - print(set(data.values())) + self.assertNotEmpty(set(data.values())) def test_task_from_tags(self): _tags = [ @@ -132,10 +132,10 @@ def test_model_testings_and_architectures(self): def test__ccached_config_64(self): from onnx_diagnostic.torch_models.hghub.hub_data_cached_configs import ( - _cached_hf_internal_testing_tiny_random_beitforimageclassification, + _ccached_hf_internal_testing_tiny_random_beitforimageclassification, ) - conf = _cached_hf_internal_testing_tiny_random_beitforimageclassification() + conf = _ccached_hf_internal_testing_tiny_random_beitforimageclassification() self.assertEqual(conf.auxiliary_channels, 256) diff --git a/_unittests/ut_torch_models/test_hghub_model.py b/_unittests/ut_torch_models/test_hghub_model.py index 700948b4..6083812f 100644 --- a/_unittests/ut_torch_models/test_hghub_model.py +++ b/_unittests/ut_torch_models/test_hghub_model.py @@ -1,7 +1,5 @@ import pprint import unittest -import torch -import transformers from onnx_diagnostic.ext_test_case import ( ExtTestCase, hide_stdout, @@ -9,22 +7,13 @@ requires_transformers, ignore_errors, ) -from onnx_diagnostic.torch_models.hghub.model_inputs import ( - config_class_from_architecture, - get_untrained_model_with_inputs, -) +from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config from onnx_diagnostic.torch_models.hghub.hub_data import load_models_testing from onnx_diagnostic.torch_export_patches import bypass_export_some_errors class TestHuggingFaceHubModel(ExtTestCase): - @requires_transformers("4.50") # we limit to some versions of the CI - @requires_torch("2.7") - def test_config_class_from_architecture(self): - config = config_class_from_architecture("LlamaForCausalLM") - self.assertEqual(config, transformers.LlamaConfig) - @hide_stdout() def test_get_untrained_model_with_inputs_tiny_llm(self): mid = "arnir0/Tiny-LLM" @@ -107,91 +96,6 @@ def test_get_untrained_model_with_inputs_clip_vit(self): # different expected value for different version of transformers self.assertIn((data["size"], data["n_weights"]), [(188872708, 47218177)]) - @hide_stdout() - def test_get_untrained_model_with_inputs_text2text_generation(self): - mid = "sshleifer/tiny-marian-en-de" - # mid = "Salesforce/codet5-small" - data = get_untrained_model_with_inputs(mid, verbose=1) - self.assertIn((data["size"], data["n_weights"]), [(473928, 118482)]) - model, inputs = data["model"], data["inputs"] - raise unittest.SkipTest(f"not working for {mid!r}") - model(**inputs) - - @hide_stdout() - def test_get_untrained_model_with_inputs_automatic_speech_recognition(self): - mid = "openai/whisper-tiny" - data = get_untrained_model_with_inputs(mid, verbose=1) - self.assertIn((data["size"], data["n_weights"]), [(132115968, 33028992)]) - model, inputs, ds = data["model"], 
data["inputs"], data["dynamic_shapes"] - Dim = torch.export.Dim - self.maxDiff = None - self.assertIn("{0:Dim(batch),1:Dim(seq_length)}", self.string_type(ds)) - self.assertEqualAny( - { - "decoder_input_ids": { - 0: Dim("batch", min=1, max=1024), - 1: Dim("seq_length", min=1, max=4096), - }, - "cache_position": {0: Dim("seq_length", min=1, max=4096)}, - "encoder_outputs": [{0: Dim("batch", min=1, max=1024)}], - "past_key_values": [ - [ - [ - {0: Dim("batch", min=1, max=1024)}, - {0: Dim("batch", min=1, max=1024)}, - ], - [ - {0: Dim("batch", min=1, max=1024)}, - {0: Dim("batch", min=1, max=1024)}, - ], - ], - [ - [ - {0: Dim("batch", min=1, max=1024)}, - {0: Dim("batch", min=1, max=1024)}, - ], - [ - {0: Dim("batch", min=1, max=1024)}, - {0: Dim("batch", min=1, max=1024)}, - ], - ], - ], - }, - ds, - ) - model(**inputs) - self.assertEqual( - "#1[T1r3]", - self.string_type(torch.utils._pytree.tree_flatten(inputs["encoder_outputs"])[0]), - ) - with bypass_export_some_errors(patch_transformers=True, verbose=10): - flat = torch.utils._pytree.tree_flatten(inputs["past_key_values"])[0] - self.assertIsInstance(flat, list) - self.assertIsInstance(flat[0], torch.Tensor) - self.assertEqual( - "#8[T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4]", - self.string_type(flat), - ) - torch.export.export(model, (), kwargs=inputs, dynamic_shapes=ds, strict=False) - with bypass_export_some_errors(patch_transformers=True, verbose=10): - flat = torch.utils._pytree.tree_flatten(inputs["past_key_values"])[0] - self.assertIsInstance(flat, list) - self.assertIsInstance(flat[0], torch.Tensor) - self.assertEqual( - "#8[T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4,T1r4]", - self.string_type(flat), - ) - torch.export.export(model, (), kwargs=inputs, dynamic_shapes=ds, strict=False) - - @hide_stdout() - def test_get_untrained_model_with_inputs_imagetext2text_generation(self): - mid = "HuggingFaceM4/tiny-random-idefics" - # mid = "Salesforce/codet5-small" - data = get_untrained_model_with_inputs(mid, verbose=1) - self.assertIn((data["size"], data["n_weights"]), [(12742888, 3185722)]) - model, inputs = data["model"], data["inputs"] - model(**inputs) - @hide_stdout() @requires_torch("2.7", "reduce test time") @requires_transformers("4.50", "reduce test time") @@ -210,11 +114,9 @@ def _diff(c1, c2): for mid in load_models_testing(): with self.subTest(mid=mid): if mid in { - "hf-internal-testing/tiny-random-BeitForImageClassification", "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation", "hf-internal-testing/tiny-random-MoonshineForConditionalGeneration", "fxmarty/pix2struct-tiny-random", - "hf-internal-testing/tiny-random-ViTMSNForImageClassification", "hf-internal-testing/tiny-random-YolosModel", }: print(f"-- not implemented yet for {mid!r}") diff --git a/_unittests/ut_torch_models/test_test_helpers.py b/_unittests/ut_torch_models/test_test_helpers.py index 6f846983..a1c99a89 100644 --- a/_unittests/ut_torch_models/test_test_helpers.py +++ b/_unittests/ut_torch_models/test_test_helpers.py @@ -9,12 +9,12 @@ filter_inputs, run_ort_fusion, ) -from onnx_diagnostic.torch_models.hghub.model_inputs import get_get_inputs_function_for_tasks +from onnx_diagnostic.tasks import supported_tasks class TestTestHelper(ExtTestCase): def test_get_inputs_for_task(self): - fcts = get_get_inputs_function_for_tasks() + fcts = supported_tasks() for task in self.subloop(sorted(fcts)): data = get_inputs_for_task(task) self.assertIsInstance(data, dict) diff --git a/_unittests/ut_xrun_doc/test_documentation_examples.py 
b/_unittests/ut_xrun_doc/test_documentation_examples.py index 9aedf4b1..77891e73 100644 --- a/_unittests/ut_xrun_doc/test_documentation_examples.py +++ b/_unittests/ut_xrun_doc/test_documentation_examples.py @@ -53,9 +53,9 @@ def run_test(self, fold: str, name: str, verbose=0) -> int: if '"dot" not found in path.' in st: # dot not installed, this part # is tested in onnx framework - if verbose: - print(f"failed: {name!r} due to missing dot.") - return 0 + raise unittest.SkipTest(f"failed: {name!r} due to missing dot.") + if "We couldn't connect to 'https://huggingface.co'" in st: + raise unittest.SkipTest(f"Connectivity issues due to\n{err}") raise AssertionError( # noqa: B904 "Example '{}' (cmd: {} - exec_prefix='{}') " "failed due to\n{}" diff --git a/_unittests/ut_xrun_doc/test_documentation_recipes.py b/_unittests/ut_xrun_doc/test_documentation_recipes.py index 4ea8c171..d057b744 100644 --- a/_unittests/ut_xrun_doc/test_documentation_recipes.py +++ b/_unittests/ut_xrun_doc/test_documentation_recipes.py @@ -52,9 +52,9 @@ def run_test(self, fold: str, name: str, verbose=0) -> int: if '"dot" not found in path.' in st: # dot not installed, this part # is tested in onnx framework - if verbose: - print(f"failed: {name!r} due to missing dot.") - return 0 + raise unittest.SkipTest(f"failed: {name!r} due to missing dot.") + if "We couldn't connect to 'https://huggingface.co'" in st: + raise unittest.SkipTest(f"Connectivity issues due to\n{err}") raise AssertionError( # noqa: B904 "Example '{}' (cmd: {} - exec_prefix='{}') " "failed due to\n{}" diff --git a/onnx_diagnostic/_command_lines_parser.py b/onnx_diagnostic/_command_lines_parser.py index 6d2456d1..b53e196b 100644 --- a/onnx_diagnostic/_command_lines_parser.py +++ b/onnx_diagnostic/_command_lines_parser.py @@ -303,13 +303,13 @@ def get_parser_validate() -> ArgumentParser: def _cmd_validate(argv: List[Any]): from .helpers import string_type from .torch_models.test_helper import get_inputs_for_task, validate_model, _ds_clean - from .torch_models.hghub.model_inputs import get_get_inputs_function_for_tasks + from .tasks import supported_tasks parser = get_parser_validate() args = parser.parse_args(argv[1:]) if not args.task and not args.mid: print("-- list of supported tasks:") - print("\n".join(sorted(get_get_inputs_function_for_tasks()))) + print("\n".join(supported_tasks())) elif not args.mid: data = get_inputs_for_task(args.task) if args.verbose: diff --git a/onnx_diagnostic/helpers/config_helper.py b/onnx_diagnostic/helpers/config_helper.py new file mode 100644 index 00000000..38d353b6 --- /dev/null +++ b/onnx_diagnostic/helpers/config_helper.py @@ -0,0 +1,80 @@ +import functools +import importlib +import inspect +import re +from typing import Any, Dict, Optional, Tuple, Union +import transformers + + +def check_hasattr(config: Any, *args: Union[str, Tuple[Any, ...]]): + """ + Checks the configuration has all the attributes in ``args``. + Raises an exception otherwise.
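A minimal usage sketch for ``check_hasattr``, assuming a plain ``types.SimpleNamespace`` in place of a real ``transformers`` configuration: a bare string requires that attribute, while a tuple lists alternatives::

    import types
    from onnx_diagnostic.helpers.config_helper import check_hasattr

    cfg = types.SimpleNamespace(hidden_size=16, num_attention_heads=4)
    # every plain string must be present
    check_hasattr(cfg, "hidden_size", "num_attention_heads")
    # a tuple passes if any alternative holds: head_dim, or both
    # hidden_size and num_attention_heads
    check_hasattr(cfg, ("head_dim", ("hidden_size", "num_attention_heads")))
    # a missing attribute raises an AssertionError
    # check_hasattr(cfg, "vocab_size")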
+ """ + for a in args: + assert isinstance(a, (str, tuple)), f"unexpected type {type(a)} in {args!r}" + if isinstance(a, str): + assert (isinstance(config, dict) and a in config) or hasattr( + config, a + ), f"Missing attribute {a!r} in\n{config}" + elif isinstance(a, tuple): + assert any( + (isinstance(name, str) and hasattr(config, name)) + or all(hasattr(config, _) for _ in name) + for name in a + ), f"All attributes in {a!r} are missing from\n{config}" + + +def update_config(config: Any, mkwargs: Dict[str, Any]): + """Updates a configuration with different values.""" + for k, v in mkwargs.items(): + if isinstance(v, dict): + assert hasattr( + config, k + ), f"missing attribute {k!r} in config={config}, cannot update it with {v}" + update_config(getattr(config, k), v) + else: + setattr(config, k, v) + + +def _pick(config, *atts): + """Returns the first value found in the configuration.""" + for a in atts: + if isinstance(a, str): + if hasattr(config, a): + return getattr(config, a) + elif isinstance(a, tuple): + if all(hasattr(config, _) for _ in a[1:]): + return a[0]([getattr(config, _) for _ in a[1:]]) + raise AssertionError(f"Unable to find any of these {atts!r} in {config}") + + +@functools.cache +def config_class_from_architecture(arch: str, exc: bool = False) -> Optional[type]: + """ + Retrieves the configuration class for a given architecture. + + :param arch: architecture (clas name) + :param exc: raise an exception if not found + :return: type + """ + cls = getattr(transformers, arch) + mod_name = cls.__module__ + mod = importlib.import_module(mod_name) + source = inspect.getsource(mod) + reg = re.compile("config: ([A-Za-z0-9]+)") + fall = reg.findall(source) + if len(fall) == 0: + assert not exc, ( + f"Unable to guess Configuration class name for arch={arch!r}, " + f"module={mod_name!r}, no candidate, source is\n{source}" + ) + return None + unique = set(fall) + assert len(unique) == 1, ( + f"Unable to guess Configuration class name for arch={arch!r}, " + f"module={mod_name!r}, found={unique} (#{len(unique)}), " + f"source is\n{source}" + ) + cls_name = unique.pop() + return getattr(transformers, cls_name) diff --git a/onnx_diagnostic/tasks/__init__.py b/onnx_diagnostic/tasks/__init__.py new file mode 100644 index 00000000..b2d01d36 --- /dev/null +++ b/onnx_diagnostic/tasks/__init__.py @@ -0,0 +1,44 @@ +from typing import Any, Callable, Dict, List, Tuple +from . import ( + automatic_speech_recognition, + fill_mask, + image_classification, + image_text_to_text, + text_generation, + text2text_generation, + zero_shot_image_classification, +) + +__TASKS__ = [ + automatic_speech_recognition, + fill_mask, + image_classification, + image_text_to_text, + text_generation, + text2text_generation, + zero_shot_image_classification, +] + + +def supported_tasks() -> List[str]: + "Returns the list of supported tasks." + return sorted(mod.__TASK__ for mod in __TASKS__) + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + tasks = {mod.__TASK__: mod.reduce_model_config for mod in __TASKS__} + assert task in tasks, f"Task {task!r} not found in {sorted(tasks)}" + return tasks[task](config, task) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + If the configuration is None, the function selects typical dimensions. + It returns parameters and a function. The function creates dummy inputs + if it receives the parameters returned as a first result. 
+ """ + tasks = {mod.__TASK__: mod.random_input_kwargs for mod in __TASKS__} + assert task in tasks, f"Task {task!r} not found in {sorted(tasks)}" + return tasks[task](config, task) diff --git a/onnx_diagnostic/tasks/automatic_speech_recognition.py b/onnx_diagnostic/tasks/automatic_speech_recognition.py new file mode 100644 index 00000000..e9ab82bc --- /dev/null +++ b/onnx_diagnostic/tasks/automatic_speech_recognition.py @@ -0,0 +1,165 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +import transformers +from ..helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache +from ..helpers.config_helper import update_config, check_hasattr + +__TASK__ = "automatic-speech-recognition" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + kwargs: Dict[str, Any] = {} + if hasattr(config, "num_decoder_layers"): + config.num_decoder_layers = min(config.num_decoder_layers, 2) + if hasattr(config, "decoder_layers"): + config.decoder_layers = min(config.decoder_layers, 2) + if hasattr(config, "num_hidden_layers"): + config.num_hidden_layers = min(config.num_hidden_layers, 2) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + dummy_max_token_id: int, + max_source_positions: int, + d_model: int, + num_hidden_layers: int, + encoder_attention_heads: int, + encoder_layers: int, + decoder_layers: int, + head_dim: int, + batch_size: int = 2, + sequence_length: int = 30, + **kwargs, # unused +): + """ + Generates inputs for task ``text2text-generation``. + Example: + + :: + + dict( + cache_position:T7s4, + past_key_values:EncoderDecoderCache( + self_attention_cache=DynamicCache[serialized](#2[#0[],#0[]]), + cross_attention_cache=DynamicCache[serialized](#2[#0[],#0[]]) + ), + decoder_input_ids:T7s1x4, + encoder_outputs:BaseModelOutput(last_hidden_state:T1s1x1500x384), + use_cache:bool,return_dict:bool + ) + dict( + cache_position:T7s1, + past_key_values:EncoderDecoderCache( + self_attention_cache=DynamicCache[serialized](#2[ + #4[T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64], + #4[T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64] + ]), + cross_attention_cache=DynamicCache[serialized](#2[ + #4[T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64], + #4[T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64] + ]), + ), + decoder_input_ids:T7s1x1, + encoder_outputs:BaseModelOutput(last_hidden_state:T1s1x1500x384), + use_cache:bool,return_dict:bool + ) + """ + batch = torch.export.Dim("batch", min=1, max=1024) + seq_length = torch.export.Dim("seq_length", min=1, max=4096) + + shapes = { + "decoder_input_ids": {0: batch, 1: seq_length}, + "cache_position": {0: seq_length}, + "encoder_outputs": [{0: batch}], + "past_key_values": [ + [ + [{0: batch} for _ in range(num_hidden_layers)], + [{0: batch} for _ in range(num_hidden_layers)], + ], + [ + [{0: batch} for _ in range(num_hidden_layers)], + [{0: batch} for _ in range(num_hidden_layers)], + ], + ], + } + inputs = dict( + decoder_input_ids=torch.randint( + 0, dummy_max_token_id, (batch_size, sequence_length) + ).to(torch.int64), + cache_position=(torch.arange(sequence_length) + 5).to(torch.int64), + encoder_outputs=transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=torch.randn(batch_size, max_source_positions, d_model) + ), + past_key_values=make_encoder_decoder_cache( + make_dynamic_cache( + [ + ( + torch.randn( + batch_size, encoder_attention_heads, encoder_layers, head_dim + 
), + torch.randn( + batch_size, encoder_attention_heads, encoder_layers, head_dim + ), + ) + for i in range(num_hidden_layers) + ] + ), + make_dynamic_cache( + [ + ( + torch.randn( + batch_size, encoder_attention_heads, max_source_positions, head_dim + ), + torch.randn( + batch_size, encoder_attention_heads, max_source_positions, head_dim + ), + ) + for i in range(num_hidden_layers) + ] + ), + ), + # one these is selected based on the forward method signature + # encoder_last_hidden_state=torch.randn(batch_size, sequence_length2, encoder_dim), + # encoder_outputs=torch.randn(batch_size, sequence_length2, encoder_dim), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. + """ + if config is not None: + check_hasattr( + config, + "d_model", + "decoder_attention_heads", + "decoder_layers", + "encoder_attention_heads", + "encoder_layers", + "max_source_positions", + "num_hidden_layers", + "vocab_size", + ) + kwargs = dict( + batch_size=2, + sequence_length=30, + dummy_max_token_id=31000 if config is None else config.vocab_size, + max_source_positions=1500 if config is None else config.max_source_positions, + d_model=384 if config is None else config.d_model, + num_hidden_layers=4 if config is None else config.num_hidden_layers, + encoder_attention_heads=6 if config is None else config.encoder_attention_heads, + encoder_layers=4 if config is None else config.encoder_layers, + decoder_attention_heads=6 if config is None else config.decoder_attention_heads, + decoder_layers=4 if config is None else config.decoder_layers, + head_dim=( + 64 if config is None else (config.d_model // config.encoder_attention_heads) + ), + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/tasks/fill_mask.py b/onnx_diagnostic/tasks/fill_mask.py new file mode 100644 index 00000000..ef427b91 --- /dev/null +++ b/onnx_diagnostic/tasks/fill_mask.py @@ -0,0 +1,67 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +from ..helpers.config_helper import update_config, check_hasattr + +__TASK__ = "fill-mask" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + check_hasattr(config, "num_attention_heads", "num_hidden_layers") + kwargs = dict( + num_hidden_layers=min(config.num_hidden_layers, 2), + num_attention_heads=min(config.num_attention_heads, 4), + ) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + batch_size: int, + sequence_length: int, + dummy_max_token_id: int, + **kwargs, # unused +): + """ + Generates inputs for task ``fill-mask``. 
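A sketch of producing such inputs without any checkpoint, going through the task dispatcher with its ``config=None`` defaults::

    from onnx_diagnostic.tasks import random_input_kwargs

    kwargs, fct = random_input_kwargs(None, "fill-mask")
    data = fct(model=None, config=None, **kwargs)
    print({k: tuple(v.shape) for k, v in data["inputs"].items()})
    # {'input_ids': (2, 30), 'token_type_ids': (2, 30), 'attention_mask': (2, 30)}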
+ Example: + + :: + + input_ids:T7s1x13[101,72654:A16789.23076923077], + token_type_ids:T7s1x13[0,0:A0.0], + attention_mask:T7s1x13[1,1:A1.0]) + """ + batch = torch.export.Dim("batch", min=1, max=1024) + seq_length = torch.export.Dim("sequence_length", min=1, max=1024) + shapes = { + "input_ids": {0: batch, 1: seq_length}, + "token_type_ids": {0: batch, 1: seq_length}, + "attention_mask": {0: batch, 1: seq_length}, + } + inputs = dict( + input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to( + torch.int64 + ), + token_type_ids=torch.zeros((batch_size, sequence_length)).to(torch.int64), + attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. + """ + if config is not None: + check_hasattr(config, "vocab_size") + kwargs = dict( + batch_size=2, + sequence_length=30, + dummy_max_token_id=31999 if config is None else (config.vocab_size - 1), + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/tasks/image_classification.py b/onnx_diagnostic/tasks/image_classification.py new file mode 100644 index 00000000..4b0c9757 --- /dev/null +++ b/onnx_diagnostic/tasks/image_classification.py @@ -0,0 +1,96 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +from ..helpers.config_helper import update_config, check_hasattr + +__TASK__ = "image-classification" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + check_hasattr(config, ("num_hidden_layers", "hidden_sizes")) + kwargs = dict( + num_hidden_layers=( + min(config.num_hidden_layers, 2) + if hasattr(config, "num_hidden_layers") + else len(config.hidden_sizes) + ) + ) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + input_width: int, + input_height: int, + input_channels: int, + batch_size: int = 2, + dynamic_rope: bool = False, + **kwargs, # unused +): + """ + Generates inputs for task ``image-classification``. + + :param model: model to get the missing information + :param config: configuration used to generate the model + :param batch_size: batch size + :param input_channels: input channel + :param input_width: input width + :param input_height: input height + :return: dictionary + """ + assert isinstance( + input_width, int + ), f"Unexpected type for input_width {type(input_width)}{config}" + assert isinstance( + input_width, int + ), f"Unexpected type for input_height {type(input_height)}{config}" + + shapes = { + "pixel_values": { + 0: torch.export.Dim("batch", min=1, max=1024), + 2: torch.export.Dim("width", min=1, max=4096), + 3: torch.export.Dim("height", min=1, max=4096), + }, + } + inputs = dict( + pixel_values=torch.randn(batch_size, input_channels, input_width, input_height).clamp( + -1, 1 + ), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. 
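For instance, with no configuration the defaults below produce a ``[2, 3, 224, 224]`` pixel tensor and mark batch, width and height as dynamic; a minimal sketch::

    from onnx_diagnostic.tasks import random_input_kwargs

    kwargs, fct = random_input_kwargs(None, "image-classification")
    data = fct(model=None, config=None, **kwargs)
    print(data["inputs"]["pixel_values"].shape)             # torch.Size([2, 3, 224, 224])
    print(sorted(data["dynamic_shapes"]["pixel_values"]))   # [0, 2, 3]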
+ """ + if config is not None: + check_hasattr(config, ("image_size", "architectures"), "num_channels") + if config is not None: + if hasattr(config, "image_size"): + image_size = config.image_size + else: + assert config.architectures, f"empty architecture in {config}" + from ..torch_models.hghub.hub_api import get_architecture_default_values + + default_values = get_architecture_default_values(config.architectures[0]) + image_size = default_values["image_size"] + if config is None or isinstance(image_size, int): + kwargs = dict( + batch_size=2, + input_width=224 if config is None else image_size, + input_height=224 if config is None else image_size, + input_channels=3 if config is None else config.num_channels, + ) + else: + kwargs = dict( + batch_size=2, + input_width=config.image_size[0], + input_height=config.image_size[1], + input_channels=config.num_channels, + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/tasks/image_text_to_text.py b/onnx_diagnostic/tasks/image_text_to_text.py new file mode 100644 index 00000000..60d7cfd0 --- /dev/null +++ b/onnx_diagnostic/tasks/image_text_to_text.py @@ -0,0 +1,145 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +from ..helpers.cache_helper import make_dynamic_cache +from ..helpers.config_helper import update_config, check_hasattr, _pick + +__TASK__ = "image-text-to-text" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + kwargs: Dict[str, Any] = {} + if hasattr(config, "num_hidden_layers"): + config.num_hidden_layers = min(config.num_hidden_layers, 2) + if hasattr(config, "vision_config") and hasattr(config.vision_config, "num_hidden_layers"): + config.vision_config.num_hidden_layers = min(config.vision_config.num_hidden_layers, 2) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + dummy_max_token_id: int, + num_key_value_heads: int, + num_hidden_layers: int, + head_dim: int, + width: int, + height: int, + num_channels: int, + batch_size: int = 2, + sequence_length: int = 30, + sequence_length2: int = 3, + n_images: int = 2, + dynamic_rope: bool = False, + **kwargs, # unused +): + """ + Generates input for task ``text-generation``. 
+ + :param model: model to get the missing information + :param config: configuration used to generate the model + :param head_dim: last dimension of the cache + :param dummy_max_token_id: dummy max token id + :param batch_size: batch size + :param sequence_length: sequence length + :param sequence_length2: new sequence length + :param n_images: number of images + :param width: width of the image + :param height: height of the image + :param num_channels: number of channels + :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`) + :return: dictionary + """ + batch = torch.export.Dim("batch", min=1, max=1024) + seq_length = torch.export.Dim("seq_length", min=1, max=4096) + cache_length = torch.export.Dim("cache_length", min=1, max=4096) + images = torch.export.Dim("images", min=1, max=4096) + + shapes = { + "input_ids": {0: batch, 1: seq_length}, + "attention_mask": { + 0: batch, + 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length + }, + "position_ids": { + 0: batch, + 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length + }, + "past_key_values": [ + [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], + [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], + ], + "pixel_values": {0: batch, 1: images}, + "image_attention_mask": {0: batch, 1: seq_length, 2: images}, + } + inputs = dict( + input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to( + torch.int64 + ), + attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to( + torch.int64 + ), + position_ids=torch.arange(sequence_length, sequence_length + sequence_length2) + .to(torch.int64) + .expand((batch_size, -1)), + past_key_values=make_dynamic_cache( + [ + ( + torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), + torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), + ) + for i in range(num_hidden_layers) + ] + ), + image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to( + torch.int64 + ), + pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to( + torch.int64 + ), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. 
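A sketch of the dummy batch obtained with the ``config=None`` defaults: two 3-channel 224x224 images per sample plus a ``DynamicCache`` for the text side::

    from onnx_diagnostic.tasks import random_input_kwargs

    kwargs, fct = random_input_kwargs(None, "image-text-to-text")
    data = fct(model=None, config=None, **kwargs)
    print(sorted(data["inputs"]))
    # ['attention_mask', 'image_attention_mask', 'input_ids',
    #  'past_key_values', 'pixel_values', 'position_ids']
    print(data["inputs"]["pixel_values"].shape)   # torch.Size([2, 2, 3, 224, 224])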
+ """ + if config is not None: + check_hasattr( + config, + "vocab_size", + "hidden_size", + "num_attention_heads", + ("num_key_value_heads", "num_attention_heads"), + "intermediate_size", + "hidden_size", + "vision_config", + ) + check_hasattr(config.vision_config, "image_size", "num_channels") + kwargs = dict( + batch_size=2, + sequence_length=30, + sequence_length2=3, + head_dim=( + 16 + if config is None + else getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + ), + dummy_max_token_id=31999 if config is None else config.vocab_size - 1, + num_hidden_layers=4 if config is None else config.num_hidden_layers, + num_key_value_heads=( + 8 + if config is None + else _pick(config, "num_key_value_heads", "num_attention_heads") + ), + intermediate_size=1024 if config is None else config.intermediate_size, + hidden_size=512 if config is None else config.hidden_size, + width=224 if config is None else config.vision_config.image_size, + height=224 if config is None else config.vision_config.image_size, + num_channels=3 if config is None else config.vision_config.num_channels, + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/tasks/text2text_generation.py b/onnx_diagnostic/tasks/text2text_generation.py new file mode 100644 index 00000000..abce3714 --- /dev/null +++ b/onnx_diagnostic/tasks/text2text_generation.py @@ -0,0 +1,172 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +from ..helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache +from ..helpers.config_helper import update_config, check_hasattr, _pick + +__TASK__ = "text2text-generation" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + kwargs: Dict[str, Any] = {} + if hasattr(config, "num_decoder_layers"): + config.num_decoder_layers = min(config.num_decoder_layers, 2) + if hasattr(config, "num_hidden_layers"): + config.num_hidden_layers = min(config.num_hidden_layers, 2) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + dummy_max_token_id: int, + num_key_value_heads: int, + num_hidden_layers: int, + head_dim: int, + encoder_dim: int, + batch_size: int = 2, + sequence_length: int = 30, + sequence_length2: int = 3, + **kwargs, # unused +): + """ + Generates input for task ``text2text-generation``. + + :param model: model to get the missing information + :param config: configuration used to generate the model + :param head_dim: last dimension of the cache + :param dummy_max_token_id: dummy max token id + :param batch_size: batch size + :param encoder_dim: last dimension of encoder_last_hidden_state + :param sequence_length: sequence length + :param sequence_length2: new sequence length + :return: dictionary + + Stolen inputs for one model. 
+ + :: + + cache_position:T7s1 + past_key_values:EncoderDecoderCache( + self_attention_cache=DynamicCache( + key_cache=#6[T1s1x8x1x64,...], + value_cache=#6[T1s1x8x1x64,...]), + cross_attention_cache=DynamicCache( + key_cache=#6[T1s1x8x16x64,...], + value_cache=#6[T1s1x8x16x64,...])), + decoder_input_ids:T7s1x1, + encoder_outputs:dict(last_hidden_state:T1s1x16x512) + """ + batch = torch.export.Dim("batch", min=1, max=1024) + seq_length = torch.export.Dim("seq_length", min=1, max=4096) + cache_length = torch.export.Dim("cache_length", min=1, max=4096) + cache_length2 = torch.export.Dim("cache_length2", min=1, max=4096) + + shapes = { + "input_ids": {0: batch, 1: seq_length}, + "decoder_input_ids": {0: batch, 1: torch.export.Dim.DYNAMIC}, + "attention_mask": {0: batch, 1: torch.export.Dim.DYNAMIC}, + # "cache_position": {0: batch, 1: torch.export.Dim.DYNAMIC}, + "past_key_values": [ + [ + [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], + [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], + ], + [ + [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)], + [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)], + ], + ], + # one these is selected based on the forward method signature + # "encoder_last_hidden_state": {0: batch, 1: torch.export.Dim.DYNAMIC}, + # "encoder_outputs": {0: batch, 1: torch.export.Dim.DYNAMIC}, + } + inputs = dict( + input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to( + torch.int64 + ), + decoder_input_ids=torch.randint( + 0, dummy_max_token_id, (batch_size, sequence_length2) + ).to(torch.int64), + attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64), + # cache_position=torch.arange(sequence_length, sequence_length + sequence_length2) + # .to(torch.int64) + # .expand((batch_size, -1)), + past_key_values=make_encoder_decoder_cache( + make_dynamic_cache( + [ + ( + torch.randn( + batch_size, num_key_value_heads, sequence_length, head_dim + ), + torch.randn( + batch_size, num_key_value_heads, sequence_length, head_dim + ), + ) + for i in range(num_hidden_layers) + ] + ), + make_dynamic_cache( + [ + ( + torch.randn( + batch_size, num_key_value_heads, sequence_length2, head_dim + ), + torch.randn( + batch_size, num_key_value_heads, sequence_length2, head_dim + ), + ) + for i in range(num_hidden_layers) + ] + ), + ), + # one these is selected based on the forward method signature + # encoder_last_hidden_state=torch.randn(batch_size, sequence_length2, encoder_dim), + # encoder_outputs=torch.randn(batch_size, sequence_length2, encoder_dim), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. 
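A sketch of the cache built with the ``config=None`` defaults, assuming ``make_encoder_decoder_cache`` returns the ``transformers`` ``EncoderDecoderCache`` shown above::

    from onnx_diagnostic.tasks import random_input_kwargs

    kwargs, fct = random_input_kwargs(None, "text2text-generation")
    data = fct(model=None, config=None, **kwargs)
    cache = data["inputs"]["past_key_values"]
    print(cache.self_attention_cache.key_cache[0].shape)    # torch.Size([2, 16, 30, 16])
    print(cache.cross_attention_cache.key_cache[0].shape)   # torch.Size([2, 16, 3, 16])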
+ """ + if config is not None: + check_hasattr( + config, + "vocab_size", + "hidden_size", + "num_attention_heads", + ("num_hidden_layers", "num_layers"), + ("n_positions", "d_model"), + ( + "num_key_value_heads", + "num_heads", + ("decoder_attention_heads", "encoder_attention_heads"), + ), + ) + kwargs = dict( + batch_size=2, + sequence_length=30, + sequence_length2=3, + head_dim=16 if config is None else (config.d_kv if hasattr(config, "d_kv") else 1), + dummy_max_token_id=31999 if config is None else config.vocab_size - 1, + num_hidden_layers=( + 8 if config is None else _pick(config, "num_hidden_layers", "num_layers") + ), + num_key_value_heads=( + 16 + if config is None + else _pick( + config, + "num_key_value_heads", + "num_heads", + (sum, "encoder_attention_heads", "decoder_attention_heads"), + ) + ), + encoder_dim=512 if config is None else _pick(config, "n_positions", "d_model"), + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/tasks/text_generation.py b/onnx_diagnostic/tasks/text_generation.py new file mode 100644 index 00000000..7cb249a5 --- /dev/null +++ b/onnx_diagnostic/tasks/text_generation.py @@ -0,0 +1,148 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +from ..helpers.cache_helper import make_dynamic_cache +from ..helpers.config_helper import update_config, check_hasattr, _pick + +__TASK__ = "text-generation" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + check_hasattr( + config, + ("head_dim", ("hidden_size", "num_attention_heads")), + "num_hidden_layers", + ("num_key_value_heads", "num_attention_heads"), + "intermediate_size", + "hidden_size", + ) + kwargs = dict( + head_dim=getattr(config, "head_dim", config.hidden_size // config.num_attention_heads), + num_hidden_layers=min(config.num_hidden_layers, 2), + num_key_value_heads=( + config.num_key_value_heads + if hasattr(config, "num_key_value_heads") + else config.num_attention_heads + ), + intermediate_size=( + min(config.intermediate_size, 24576 // 4) + if config.intermediate_size % 4 == 0 + else config.intermediate_size + ), + hidden_size=( + min(config.hidden_size, 3072 // 4) + if config.hidden_size % 4 == 0 + else config.hidden_size + ), + ) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + dummy_max_token_id: int, + num_key_value_heads: int, + num_hidden_layers: int, + head_dim: int, + batch_size: int = 2, + sequence_length: int = 30, + sequence_length2: int = 3, + dynamic_rope: bool = False, + **kwargs, # unused +): + """ + Generates input for task ``text-generation``. 
+ + :param model: model to get the missing information + :param config: configuration used to generate the model + :param head_dim: last dimension of the cache + :param dummy_max_token_id: dummy max token id + :param batch_size: batch size + :param sequence_length: sequence length + :param sequence_length2: new sequence length + :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`) + :return: dictionary + """ + if head_dim is None: + assert config, "head_dim is None, the value cannot be set without a configuration" + head_dim = config.hidden_size // config.num_attention_heads + batch = torch.export.Dim("batch", min=1, max=1024) + seq_length = torch.export.Dim("seq_length", min=1, max=4096) + cache_length = torch.export.Dim("cache_length", min=1, max=4096) + + shapes = { + "input_ids": {0: batch, 1: seq_length}, + "attention_mask": { + 0: batch, + 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length + }, + "position_ids": { + 0: batch, + 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length + }, + "past_key_values": [ + [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], + [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], + ], + } + inputs = dict( + input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to( + torch.int64 + ), + attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to( + torch.int64 + ), + position_ids=torch.arange(sequence_length, sequence_length + sequence_length2) + .to(torch.int64) + .expand((batch_size, -1)), + past_key_values=make_dynamic_cache( + [ + ( + torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), + torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), + ) + for i in range(num_hidden_layers) + ] + ), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. 
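Putting the pieces together for a decoder-only model, a sketch that mirrors what the unit tests in this diff do (``arnir0/Tiny-LLM``, ``get_untrained_model_with_inputs`` and ``bypass_export_some_errors`` all appear there)::

    import torch
    from onnx_diagnostic.tasks import random_input_kwargs
    from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
    from onnx_diagnostic.torch_export_patches import bypass_export_some_errors

    model = get_untrained_model_with_inputs("arnir0/Tiny-LLM")["model"]
    kwargs, fct = random_input_kwargs(model.config, "text-generation")
    data = fct(model, model.config, **kwargs)
    with bypass_export_some_errors(patch_transformers=True):
        ep = torch.export.export(
            model, (), kwargs=data["inputs"],
            dynamic_shapes=data["dynamic_shapes"], strict=False,
        )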
+ """ + if config is not None: + check_hasattr( + config, + "vocab_size", + "hidden_size", + "num_attention_heads", + ("num_key_value_heads", "num_attention_heads"), + "intermediate_size", + "hidden_size", + ) + kwargs = dict( + batch_size=2, + sequence_length=30, + sequence_length2=3, + head_dim=( + 16 + if config is None + else getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + ), + dummy_max_token_id=31999 if config is None else (config.vocab_size - 1), + num_hidden_layers=4 if config is None else config.num_hidden_layers, + num_key_value_heads=( + 24 + if config is None + else _pick(config, "num_key_value_heads", "num_attention_heads") + ), + intermediate_size=1024 if config is None else config.intermediate_size, + hidden_size=512 if config is None else config.hidden_size, + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/tasks/zero_shot_image_classification.py b/onnx_diagnostic/tasks/zero_shot_image_classification.py new file mode 100644 index 00000000..87f774e5 --- /dev/null +++ b/onnx_diagnostic/tasks/zero_shot_image_classification.py @@ -0,0 +1,106 @@ +from typing import Any, Callable, Dict, Optional, Tuple +import torch +from ..helpers.config_helper import update_config, check_hasattr + +__TASK__ = "zero-shot-image-classification" + + +def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: + """Reduces a model size.""" + check_hasattr(config, "vision_config", "text_config") + check_hasattr(config.vision_config, "num_hidden_layers", "num_attention_heads") + check_hasattr(config.text_config, "num_hidden_layers", "num_attention_heads") + kwargs = dict( + vision_config=dict( + num_hidden_layers=min(2, config.vision_config.num_hidden_layers), + num_attention_heads=min(2, config.vision_config.num_attention_heads), + ), + text_config=dict( + num_hidden_layers=min(2, config.text_config.num_hidden_layers), + num_attention_heads=min(2, config.text_config.num_attention_heads), + ), + ) + update_config(config, kwargs) + return kwargs + + +def get_inputs( + model: torch.nn.Module, + config: Optional[Any], + dummy_max_token_id: int, + batch_size: int = 2, + sequence_length: int = 30, + input_width: int = 224, + input_height: int = 224, + input_channels: int = 3, + batch_size_image=3, + **kwargs, # unused +): + """ + Generates inputs for task ``zero-short-image-classification``. 
+ + :param model: model to get the missing information + :param config: configuration used to generate the model + :param dummy_max_token_id: vocabulary size + :param batch_size: batch size + :param sequence_length: sequence length + :param batch_size_image: number of images + :param input_channels: input channel + :param input_width: input width + :param input_height: input height + :return: dictionary + + # input_ids:T7s2x7 + # attention_mask:T7s2x7 + # pixel_values:T1s2x3x224x224 + """ + assert isinstance( + input_width, int + ), f"Unexpected type for input_width {type(input_width)}{config}" + assert isinstance( + input_width, int + ), f"Unexpected type for input_height {type(input_height)}{config}" + + batch = torch.export.Dim("batch", min=1, max=1024) + seq_length = torch.export.Dim("seq_length", min=1, max=4096) + shapes = { + "inputs_ids": {0: batch, 1: seq_length}, + "attention_mask": {0: batch, 1: seq_length}, + "pixel_values": { + 0: torch.export.Dim("batch_img", min=1, max=1024), + # 2: torch.export.Dim("width", min=1, max=4096), + # 3: torch.export.Dim("height", min=1, max=4096), + }, + } + inputs = dict( + input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to( + torch.int64 + ), + attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64), + pixel_values=torch.randn( + batch_size_image, input_channels, input_width, input_height + ).clamp(-1, 1), + ) + return dict(inputs=inputs, dynamic_shapes=shapes) + + +def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: + """ + Inputs kwargs. + + If the configuration is None, the function selects typical dimensions. + """ + if config is not None: + check_hasattr(config, "vision_config", "text_config") + check_hasattr(config.vision_config, "image_size", "num_channels") + check_hasattr(config.text_config, "vocab_size") + kwargs = dict( + batch_size=2, + batch_size_image=3, + sequence_length=30, + dummy_max_token_id=(49408 if config is None else (config.text_config.vocab_size - 1)), + input_width=224 if config is None else config.vision_config.image_size, + input_height=224 if config is None else config.vision_config.image_size, + input_channels=3 if config is None else config.vision_config.num_channels, + ) + return kwargs, get_inputs diff --git a/onnx_diagnostic/torch_models/hghub/hub_data.py b/onnx_diagnostic/torch_models/hghub/hub_data.py index 6acda29d..4da7f968 100644 --- a/onnx_diagnostic/torch_models/hghub/hub_data.py +++ b/onnx_diagnostic/torch_models/hghub/hub_data.py @@ -13,6 +13,7 @@ ASTModel,feature-extraction AlbertModel,feature-extraction BeitForImageClassification,image-classification + BertForMaskedLM,fill-mask BigBirdModel,feature-extraction BlenderbotModel,feature-extraction BloomModel,feature-extraction diff --git a/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py b/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py index 3949c1c3..89786821 100644 --- a/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py +++ b/onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py @@ -34,13 +34,13 @@ "Config {", "Config (**{", ) - rows = [f"def _cached_{name}():", f' "{c}"', f" return transformers.{sconf})"] + rows = [f"def _ccached_{name}():", f' "{c}"', f" return transformers.{sconf})"] srows = "\\n".join(rows) if len(srows) < 2048: print(srows) else: rows = [ - f"def _cached_{name}():", + f"def _ccached_{name}():", f' "{c}"', f' t64 = textwrap.dedent(\"\"\"', *w64, @@ -130,7 +130,7 @@ def _ccached_microsoft_phi2(): ) -def 
_cached_hf_internal_testing_tiny_random_beitforimageclassification(): +def _ccached_hf_internal_testing_tiny_random_beitforimageclassification(): "hf-internal-testing/tiny-random-BeitForImageClassification" return transformers.BeitConfig( **{ @@ -3439,3 +3439,32 @@ def _ccached_openai_clip_vit_base_patch16(): }, } ) + + +def _ccached_google_bert_bert_base_multilingual_cased(): + "google-bert/bert-base-multilingual-cased" + return transformers.BertConfig( + **{ + "architectures": ["BertForMaskedLM"], + "attention_probs_dropout_prob": 0.1, + "directionality": "bidi", + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "pooler_fc_size": 768, + "pooler_num_attention_heads": 12, + "pooler_num_fc_layers": 3, + "pooler_size_per_head": 128, + "pooler_type": "first_token_transform", + "type_vocab_size": 2, + "vocab_size": 119547, + } + ) diff --git a/onnx_diagnostic/torch_models/hghub/model_inputs.py b/onnx_diagnostic/torch_models/hghub/model_inputs.py index 969b4e95..0625b0f6 100644 --- a/onnx_diagnostic/torch_models/hghub/model_inputs.py +++ b/onnx_diagnostic/torch_models/hghub/model_inputs.py @@ -1,370 +1,10 @@ -import functools -import importlib import inspect -import re -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple import torch import transformers -from ...helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache -from .hub_api import task_from_arch, get_pretrained_config, get_architecture_default_values - - -@functools.cache -def config_class_from_architecture(arch: str, exc: bool = False) -> Optional[type]: - """ - Retrieves the configuration class for a given architecture. 
- - :param arch: architecture (clas name) - :param exc: raise an exception if not found - :return: type - """ - cls = getattr(transformers, arch) - mod_name = cls.__module__ - mod = importlib.import_module(mod_name) - source = inspect.getsource(mod) - reg = re.compile("config: ([A-Za-z0-9]+)") - fall = reg.findall(source) - if len(fall) == 0: - assert not exc, ( - f"Unable to guess Configuration class name for arch={arch!r}, " - f"module={mod_name!r}, no candidate, source is\n{source}" - ) - return None - unique = set(fall) - assert len(unique) == 1, ( - f"Unable to guess Configuration class name for arch={arch!r}, " - f"module={mod_name!r}, found={unique} (#{len(unique)}), " - f"source is\n{source}" - ) - cls_name = unique.pop() - return getattr(transformers, cls_name) - - -def _update_config(config: Any, kwargs: Dict[str, Any]): - for k, v in kwargs.items(): - if hasattr(config, k): - setattr(config, k, v) - - -def reduce_model_config(config: Any, task: str) -> Dict[str, Any]: - """Reduces a model size.""" - if task == "text-generation": - check_hasattr( - config, - ("head_dim", ("hidden_size", "num_attention_heads")), - "num_hidden_layers", - ("num_key_value_heads", "num_attention_heads"), - "intermediate_size", - "hidden_size", - ) - kwargs = dict( - head_dim=getattr( - config, "head_dim", config.hidden_size // config.num_attention_heads - ), - num_hidden_layers=min(config.num_hidden_layers, 2), - num_key_value_heads=( - config.num_key_value_heads - if hasattr(config, "num_key_value_heads") - else config.num_attention_heads - ), - intermediate_size=( - min(config.intermediate_size, 24576 // 4) - if config.intermediate_size % 4 == 0 - else config.intermediate_size - ), - hidden_size=( - min(config.hidden_size, 3072 // 4) - if config.hidden_size % 4 == 0 - else config.hidden_size - ), - ) - elif task == "image-classification": - check_hasattr(config, ("num_hidden_layers", "hidden_sizes")) - kwargs = dict( - num_hidden_layers=( - min(config.num_hidden_layers, 2) - if hasattr(config, "num_hidden_layers") - else len(config.hidden_sizes) - ) - ) - elif task == "zero-shot-image-classification": - check_hasattr(config, "vision_config", "text_config") - check_hasattr(config.vision_config, "num_hidden_layers", "num_attention_heads") - check_hasattr(config.text_config, "num_hidden_layers", "num_attention_heads") - kwargs = dict( - vision_config=dict( - num_hidden_layers=min(2, config.vision_config.num_hidden_layers), - num_attention_heads=min(2, config.vision_config.num_attention_heads), - ), - text_config=dict( - num_hidden_layers=min(2, config.text_config.num_hidden_layers), - num_attention_heads=min(2, config.text_config.num_attention_heads), - ), - ) - elif task == "text2text-generation": - kwargs = {} - if hasattr(config, "num_decoder_layers"): - config.num_decoder_layers = min(config.num_decoder_layers, 2) - if hasattr(config, "num_hidden_layers"): - config.num_hidden_layers = min(config.num_hidden_layers, 2) - elif task == "image-text-to-text": - kwargs = {} - if hasattr(config, "num_hidden_layers"): - config.num_hidden_layers = min(config.num_hidden_layers, 2) - if hasattr(config, "vision_config") and hasattr( - config.vision_config, "num_hidden_layers" - ): - config.vision_config.num_hidden_layers = min( - config.vision_config.num_hidden_layers, 2 - ) - elif task == "automatic-speech-recognition": - kwargs = {} - if hasattr(config, "num_decoder_layers"): - config.num_decoder_layers = min(config.num_decoder_layers, 2) - if hasattr(config, "decoder_layers"): - config.decoder_layers = 
min(config.decoder_layers, 2) - if hasattr(config, "num_hidden_layers"): - config.num_hidden_layers = min(config.num_hidden_layers, 2) - else: - raise NotImplementedError(f"Input generation for task {task!r} not implemented yet.") - - update_config(config, kwargs) - return kwargs - - -def update_config(config: Any, mkwargs: Dict[str, Any]): - """Updates a configuration with different values.""" - for k, v in mkwargs.items(): - if isinstance(v, dict): - assert hasattr( - config, k - ), f"missing attribute {k!r} in config={config}, cannot update it with {v}" - update_config(getattr(config, k), v) - else: - setattr(config, k, v) - - -def check_hasattr(config: Any, *args: Union[str, Tuple[Any, ...]]): - """ - Checks the confiugation has all the attributes in ``args``. - Raises an exception otherwise. - """ - for a in args: - assert isinstance(a, (str, tuple)), f"unexpected type {type(a)} in {args!r}" - if isinstance(a, str): - assert (isinstance(config, dict) and a in config) or hasattr( - config, a - ), f"Missing attribute {a!r} in\n{config}" - elif isinstance(a, tuple): - assert any( - (isinstance(name, str) and hasattr(config, name)) - or all(hasattr(config, _) for _ in name) - for name in a - ), f"All attributes in {a!r} are missing from\n{config}" - - -def _pick(config, *atts): - """Returns the first value found in the configuration.""" - for a in atts: - if isinstance(a, str): - if hasattr(config, a): - return getattr(config, a) - elif isinstance(a, tuple): - if all(hasattr(config, _) for _ in a[1:]): - return a[0]([getattr(config, _) for _ in a[1:]]) - raise AssertionError(f"Unable to find any of these {atts!r} in {config}") - - -def random_input_kwargs(config: Any, task: str) -> Tuple[Dict[str, Any], Callable]: - """ - Inputs kwargs. - - If the configuration is None, the function selects typical dimensions. 
- """ - fcts = get_get_inputs_function_for_tasks() - assert task in fcts, f"Unsupported task {task!r}, supported are {sorted(fcts)}" - if task == "text-generation": - if config is not None: - check_hasattr( - config, - "vocab_size", - "hidden_size", - "num_attention_heads", - ("num_key_value_heads", "num_attention_heads"), - "intermediate_size", - "hidden_size", - ) - kwargs = dict( - batch_size=2, - sequence_length=30, - sequence_length2=3, - head_dim=( - 16 - if config is None - else getattr( - config, "head_dim", config.hidden_size // config.num_attention_heads - ) - ), - dummy_max_token_id=31999 if config is None else (config.vocab_size - 1), - num_hidden_layers=4 if config is None else config.num_hidden_layers, - num_key_value_heads=( - 24 - if config is None - else _pick(config, "num_key_value_heads", "num_attention_heads") - ), - intermediate_size=1024 if config is None else config.intermediate_size, - hidden_size=512 if config is None else config.hidden_size, - ) - fct = get_inputs_for_text_generation - elif task == "text2text-generation": - if config is not None: - check_hasattr( - config, - "vocab_size", - "hidden_size", - "num_attention_heads", - ("num_hidden_layers", "num_layers"), - ("n_positions", "d_model"), - ( - "num_key_value_heads", - "num_heads", - ("decoder_attention_heads", "encoder_attention_heads"), - ), - ) - kwargs = dict( - batch_size=2, - sequence_length=30, - sequence_length2=3, - head_dim=16 if config is None else (config.d_kv if hasattr(config, "d_kv") else 1), - dummy_max_token_id=31999 if config is None else config.vocab_size - 1, - num_hidden_layers=( - 8 if config is None else _pick(config, "num_hidden_layers", "num_layers") - ), - num_key_value_heads=( - 16 - if config is None - else _pick( - config, - "num_key_value_heads", - "num_heads", - (sum, "encoder_attention_heads", "decoder_attention_heads"), - ) - ), - encoder_dim=512 if config is None else _pick(config, "n_positions", "d_model"), - ) - fct = get_inputs_for_text2text_generation # type: ignore - elif task == "image-classification": - if config is not None: - check_hasattr(config, ("image_size", "architectures"), "num_channels") - if config is not None: - if hasattr(config, "image_size"): - image_size = config.image_size - else: - assert config.architectures, f"empty architecture in {config}" - default_values = get_architecture_default_values(config.architectures[0]) - image_size = default_values["image_size"] - if config is None or isinstance(image_size, int): - kwargs = dict( - batch_size=2, - input_width=224 if config is None else image_size, - input_height=224 if config is None else image_size, - input_channels=3 if config is None else config.num_channels, - ) - else: - kwargs = dict( - batch_size=2, - input_width=config.image_size[0], - input_height=config.image_size[1], - input_channels=config.num_channels, - ) - fct = get_inputs_for_image_classification # type: ignore - elif task == "zero-shot-image-classification": - if config is not None: - check_hasattr(config, "vision_config", "text_config") - check_hasattr(config.vision_config, "image_size", "num_channels") - check_hasattr(config.text_config, "vocab_size") - kwargs = dict( - batch_size=2, - batch_size_image=3, - sequence_length=30, - dummy_max_token_id=( - 49408 if config is None else (config.text_config.vocab_size - 1) - ), - input_width=224 if config is None else config.vision_config.image_size, - input_height=224 if config is None else config.vision_config.image_size, - input_channels=3 if config is None else 
config.vision_config.num_channels, - ) - fct = get_inputs_for_zero_shot_image_classification # type: ignore - elif task == "image-text-to-text": - if config is not None: - check_hasattr( - config, - "vocab_size", - "hidden_size", - "num_attention_heads", - ("num_key_value_heads", "num_attention_heads"), - "intermediate_size", - "hidden_size", - "vision_config", - ) - check_hasattr(config.vision_config, "image_size", "num_channels") - kwargs = dict( - batch_size=2, - sequence_length=30, - sequence_length2=3, - head_dim=( - 16 - if config is None - else getattr( - config, "head_dim", config.hidden_size // config.num_attention_heads - ) - ), - dummy_max_token_id=31999 if config is None else config.vocab_size - 1, - num_hidden_layers=4 if config is None else config.num_hidden_layers, - num_key_value_heads=( - 8 - if config is None - else _pick(config, "num_key_value_heads", "num_attention_heads") - ), - intermediate_size=1024 if config is None else config.intermediate_size, - hidden_size=512 if config is None else config.hidden_size, - width=224 if config is None else config.vision_config.image_size, - height=224 if config is None else config.vision_config.image_size, - num_channels=3 if config is None else config.vision_config.num_channels, - ) - fct = get_inputs_for_image_text_to_text # type: ignore - elif task == "automatic-speech-recognition": - if config is not None: - check_hasattr( - config, - "d_model", - "decoder_attention_heads", - "decoder_layers", - "encoder_attention_heads", - "encoder_layers", - "max_source_positions", - "num_hidden_layers", - "vocab_size", - ) - kwargs = dict( - batch_size=2, - sequence_length=30, - dummy_max_token_id=31000 if config is None else config.vocab_size, - max_source_positions=1500 if config is None else config.max_source_positions, - d_model=384 if config is None else config.d_model, - num_hidden_layers=4 if config is None else config.num_hidden_layers, - encoder_attention_heads=6 if config is None else config.encoder_attention_heads, - encoder_layers=4 if config is None else config.encoder_layers, - decoder_attention_heads=6 if config is None else config.decoder_attention_heads, - decoder_layers=4 if config is None else config.decoder_layers, - head_dim=( - 64 if config is None else (config.d_model // config.encoder_attention_heads) - ), - ) - fct = get_inputs_for_speech_automatic_recognition # type: ignore - else: - raise NotImplementedError(f"Input generation for task {task!r} not implemented yet.") - return kwargs, fct +from ...helpers.config_helper import update_config +from ...tasks import reduce_model_config, random_input_kwargs +from .hub_api import task_from_arch, get_pretrained_config def get_untrained_model_with_inputs( @@ -501,506 +141,3 @@ def compute_model_size(model: torch.nn.Module) -> Tuple[int, int]: param_size += param.nelement() * param.element_size() nparams += param.nelement() return param_size, nparams - - -def get_inputs_for_image_classification( - model: torch.nn.Module, - config: Optional[Any], - input_width: int, - input_height: int, - input_channels: int, - batch_size: int = 2, - dynamic_rope: bool = False, - **kwargs, -): - """ - Generates inputs for task ``image-classification``. 
- - :param model: model to get the missing information - :param config: configuration used to generate the model - :param batch_size: batch size - :param input_channels: input channel - :param input_width: input width - :param input_height: input height - :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1`` - :return: dictionary - """ - assert isinstance( - input_width, int - ), f"Unexpected type for input_width {type(input_width)}{config}" - assert isinstance( - input_width, int - ), f"Unexpected type for input_height {type(input_height)}{config}" - - shapes = { - "pixel_values": { - 0: torch.export.Dim("batch", min=1, max=1024), - 2: torch.export.Dim("width", min=1, max=4096), - 3: torch.export.Dim("height", min=1, max=4096), - }, - } - inputs = dict( - pixel_values=torch.randn(batch_size, input_channels, input_width, input_height).clamp( - -1, 1 - ), - ) - return dict(inputs=inputs, dynamic_shapes=shapes) - - -def get_inputs_for_zero_shot_image_classification( - model: torch.nn.Module, - config: Optional[Any], - dummy_max_token_id: int, - batch_size: int = 2, - sequence_length: int = 30, - input_width: int = 224, - input_height: int = 224, - input_channels: int = 3, - batch_size_image=3, - **kwargs, -): - """ - Generates inputs for task ``zero-short-image-classification``. - - :param model: model to get the missing information - :param config: configuration used to generate the model - :param dummy_max_token_id: vocabulary size - :param batch_size: batch size - :param sequence_length: sequence length - :param batch_size_image: number of images - :param input_channels: input channel - :param input_width: input width - :param input_height: input height - :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1`` - :return: dictionary - - # input_ids:T7s2x7 - # attention_mask:T7s2x7 - # pixel_values:T1s2x3x224x224 - """ - assert isinstance( - input_width, int - ), f"Unexpected type for input_width {type(input_width)}{config}" - assert isinstance( - input_width, int - ), f"Unexpected type for input_height {type(input_height)}{config}" - - batch = torch.export.Dim("batch", min=1, max=1024) - seq_length = torch.export.Dim("seq_length", min=1, max=4096) - shapes = { - "inputs_ids": {0: batch, 1: seq_length}, - "attention_mask": {0: batch, 1: seq_length}, - "pixel_values": { - 0: torch.export.Dim("batch_img", min=1, max=1024), - # 2: torch.export.Dim("width", min=1, max=4096), - # 3: torch.export.Dim("height", min=1, max=4096), - }, - } - inputs = dict( - input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to( - torch.int64 - ), - attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64), - pixel_values=torch.randn( - batch_size_image, input_channels, input_width, input_height - ).clamp(-1, 1), - ) - return dict(inputs=inputs, dynamic_shapes=shapes) - - -def get_inputs_for_text_generation( - model: torch.nn.Module, - config: Optional[Any], - dummy_max_token_id: int, - num_key_value_heads: int, - num_hidden_layers: int, - head_dim: int, - batch_size: int = 2, - sequence_length: int = 30, - sequence_length2: int = 3, - dynamic_rope: bool = False, - **kwargs, -): - """ - Generates input for task ``text-generation``. 
- - :param model: model to get the missing information - :param config: configuration used to generate the model - :param head_dim: last dimension of the cache - :param dummy_max_token_id: dummy max token id - :param batch_size: batch size - :param sequence_length: sequence length - :param sequence_length2: new sequence length - :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`) - :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1`` - :return: dictionary - """ - if head_dim is None: - assert config, "head_dim is None, the value cannot be set without a configuration" - head_dim = config.hidden_size // config.num_attention_heads - batch = torch.export.Dim("batch", min=1, max=1024) - seq_length = torch.export.Dim("seq_length", min=1, max=4096) - cache_length = torch.export.Dim("cache_length", min=1, max=4096) - - shapes = { - "input_ids": {0: batch, 1: seq_length}, - "attention_mask": { - 0: batch, - 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length - }, - "position_ids": { - 0: batch, - 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length - }, - "past_key_values": [ - [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], - [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], - ], - } - inputs = dict( - input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to( - torch.int64 - ), - attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to( - torch.int64 - ), - position_ids=torch.arange(sequence_length, sequence_length + sequence_length2) - .to(torch.int64) - .expand((batch_size, -1)), - past_key_values=make_dynamic_cache( - [ - ( - torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), - torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), - ) - for i in range(num_hidden_layers) - ] - ), - ) - return dict(inputs=inputs, dynamic_shapes=shapes) - - -def get_inputs_for_image_text_to_text( - model: torch.nn.Module, - config: Optional[Any], - dummy_max_token_id: int, - num_key_value_heads: int, - num_hidden_layers: int, - head_dim: int, - width: int, - height: int, - num_channels: int, - batch_size: int = 2, - sequence_length: int = 30, - sequence_length2: int = 3, - n_images: int = 2, - dynamic_rope: bool = False, - **kwargs, -): - """ - Generates input for task ``text-generation``. 
- - :param model: model to get the missing information - :param config: configuration used to generate the model - :param head_dim: last dimension of the cache - :param dummy_max_token_id: dummy max token id - :param batch_size: batch size - :param sequence_length: sequence length - :param sequence_length2: new sequence length - :param n_images: number of images - :param width: width of the image - :param height: height of the image - :param num_channels: number of channels - :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`) - :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1`` - :return: dictionary - """ - batch = torch.export.Dim("batch", min=1, max=1024) - seq_length = torch.export.Dim("seq_length", min=1, max=4096) - cache_length = torch.export.Dim("cache_length", min=1, max=4096) - images = torch.export.Dim("images", min=1, max=4096) - - shapes = { - "input_ids": {0: batch, 1: seq_length}, - "attention_mask": { - 0: batch, - 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length - }, - "position_ids": { - 0: batch, - 1: torch.export.Dim.DYNAMIC, # cache_length + seq_length - }, - "past_key_values": [ - [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], - [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], - ], - "pixel_values": {0: batch, 1: images}, - "image_attention_mask": {0: batch, 1: seq_length, 2: images}, - } - inputs = dict( - input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to( - torch.int64 - ), - attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to( - torch.int64 - ), - position_ids=torch.arange(sequence_length, sequence_length + sequence_length2) - .to(torch.int64) - .expand((batch_size, -1)), - past_key_values=make_dynamic_cache( - [ - ( - torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), - torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim), - ) - for i in range(num_hidden_layers) - ] - ), - image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to( - torch.int64 - ), - pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to( - torch.int64 - ), - ) - return dict(inputs=inputs, dynamic_shapes=shapes) - - -def get_inputs_for_text2text_generation( - model: torch.nn.Module, - config: Optional[Any], - dummy_max_token_id: int, - num_key_value_heads: int, - num_hidden_layers: int, - head_dim: int, - encoder_dim: int, - batch_size: int = 2, - sequence_length: int = 30, - sequence_length2: int = 3, - **kwargs, -): - """ - Generates input for task ``text2text-generation``. - - :param model: model to get the missing information - :param config: configuration used to generate the model - :param head_dim: last dimension of the cache - :param dummy_max_token_id: dummy max token id - :param batch_size: batch size - :param encoder_dim: last dimension of encoder_last_hidden_state - :param sequence_length: sequence length - :param sequence_length2: new sequence length - :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1`` - :return: dictionary - - Stolen inputs for one model. 
- - :: - - cache_position:T7s1 - past_key_values:EncoderDecoderCache( - self_attention_cache=DynamicCache( - key_cache=#6[T1s1x8x1x64,...], - value_cache=#6[T1s1x8x1x64,...]), - cross_attention_cache=DynamicCache( - key_cache=#6[T1s1x8x16x64,...], - value_cache=#6[T1s1x8x16x64,...])), - decoder_input_ids:T7s1x1, - encoder_outputs:dict(last_hidden_state:T1s1x16x512) - """ - batch = torch.export.Dim("batch", min=1, max=1024) - seq_length = torch.export.Dim("seq_length", min=1, max=4096) - cache_length = torch.export.Dim("cache_length", min=1, max=4096) - cache_length2 = torch.export.Dim("cache_length2", min=1, max=4096) - - shapes = { - "input_ids": {0: batch, 1: seq_length}, - "decoder_input_ids": {0: batch, 1: torch.export.Dim.DYNAMIC}, - "attention_mask": {0: batch, 1: torch.export.Dim.DYNAMIC}, - # "cache_position": {0: batch, 1: torch.export.Dim.DYNAMIC}, - "past_key_values": [ - [ - [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], - [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)], - ], - [ - [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)], - [{0: batch, 2: cache_length2} for _ in range(num_hidden_layers)], - ], - ], - # one these is selected based on the forward method signature - # "encoder_last_hidden_state": {0: batch, 1: torch.export.Dim.DYNAMIC}, - # "encoder_outputs": {0: batch, 1: torch.export.Dim.DYNAMIC}, - } - inputs = dict( - input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length)).to( - torch.int64 - ), - decoder_input_ids=torch.randint( - 0, dummy_max_token_id, (batch_size, sequence_length2) - ).to(torch.int64), - attention_mask=torch.ones((batch_size, sequence_length)).to(torch.int64), - # cache_position=torch.arange(sequence_length, sequence_length + sequence_length2) - # .to(torch.int64) - # .expand((batch_size, -1)), - past_key_values=make_encoder_decoder_cache( - make_dynamic_cache( - [ - ( - torch.randn( - batch_size, num_key_value_heads, sequence_length, head_dim - ), - torch.randn( - batch_size, num_key_value_heads, sequence_length, head_dim - ), - ) - for i in range(num_hidden_layers) - ] - ), - make_dynamic_cache( - [ - ( - torch.randn( - batch_size, num_key_value_heads, sequence_length2, head_dim - ), - torch.randn( - batch_size, num_key_value_heads, sequence_length2, head_dim - ), - ) - for i in range(num_hidden_layers) - ] - ), - ), - # one these is selected based on the forward method signature - # encoder_last_hidden_state=torch.randn(batch_size, sequence_length2, encoder_dim), - # encoder_outputs=torch.randn(batch_size, sequence_length2, encoder_dim), - ) - return dict(inputs=inputs, dynamic_shapes=shapes) - - -def get_inputs_for_speech_automatic_recognition( - model: torch.nn.Module, - config: Optional[Any], - dummy_max_token_id: int, - max_source_positions: int, - d_model: int, - num_hidden_layers: int, - encoder_attention_heads: int, - encoder_layers: int, - decoder_layers: int, - head_dim: int, - batch_size: int = 2, - sequence_length: int = 30, - **kwargs, -): - """ - Generates input for task ``text2text-generation``. - - :param model: model to get the missing information - :param config: configuration used to generate the model - :param batch_size: batch size - :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1`` - :return: dictionary - - Stolen inputs for one model. 
- - :: - - dict( - cache_position:T7s4, - past_key_values:EncoderDecoderCache( - self_attention_cache=DynamicCache[serialized](#2[#0[],#0[]]), - cross_attention_cache=DynamicCache[serialized](#2[#0[],#0[]]) - ), - decoder_input_ids:T7s1x4, - encoder_outputs:BaseModelOutput(last_hidden_state:T1s1x1500x384), - use_cache:bool,return_dict:bool - ) - dict( - cache_position:T7s1, - past_key_values:EncoderDecoderCache( - self_attention_cache=DynamicCache[serialized](#2[ - #4[T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64], - #4[T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64] - ]), - cross_attention_cache=DynamicCache[serialized](#2[ - #4[T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64], - #4[T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64] - ]), - ), - decoder_input_ids:T7s1x1, - encoder_outputs:BaseModelOutput(last_hidden_state:T1s1x1500x384), - use_cache:bool,return_dict:bool - ) - """ - batch = torch.export.Dim("batch", min=1, max=1024) - seq_length = torch.export.Dim("seq_length", min=1, max=4096) - - shapes = { - "decoder_input_ids": {0: batch, 1: seq_length}, - "cache_position": {0: seq_length}, - "encoder_outputs": [{0: batch}], - "past_key_values": [ - [ - [{0: batch} for _ in range(num_hidden_layers)], - [{0: batch} for _ in range(num_hidden_layers)], - ], - [ - [{0: batch} for _ in range(num_hidden_layers)], - [{0: batch} for _ in range(num_hidden_layers)], - ], - ], - } - inputs = dict( - decoder_input_ids=torch.randint( - 0, dummy_max_token_id, (batch_size, sequence_length) - ).to(torch.int64), - cache_position=(torch.arange(sequence_length) + 5).to(torch.int64), - encoder_outputs=transformers.modeling_outputs.BaseModelOutput( - last_hidden_state=torch.randn(batch_size, max_source_positions, d_model) - ), - past_key_values=make_encoder_decoder_cache( - make_dynamic_cache( - [ - ( - torch.randn( - batch_size, encoder_attention_heads, encoder_layers, head_dim - ), - torch.randn( - batch_size, encoder_attention_heads, encoder_layers, head_dim - ), - ) - for i in range(num_hidden_layers) - ] - ), - make_dynamic_cache( - [ - ( - torch.randn( - batch_size, encoder_attention_heads, max_source_positions, head_dim - ), - torch.randn( - batch_size, encoder_attention_heads, max_source_positions, head_dim - ), - ) - for i in range(num_hidden_layers) - ] - ), - ), - # one these is selected based on the forward method signature - # encoder_last_hidden_state=torch.randn(batch_size, sequence_length2, encoder_dim), - # encoder_outputs=torch.randn(batch_size, sequence_length2, encoder_dim), - ) - return dict(inputs=inputs, dynamic_shapes=shapes) - - -def get_get_inputs_function_for_tasks() -> Dict[str, Callable]: - """Returns all the function producing dummy inputs for every task.""" - return { - "automatic-speech-recognition": get_inputs_for_speech_automatic_recognition, - "image-classification": get_inputs_for_image_classification, - "image-text-to-text": get_inputs_for_image_text_to_text, - "text-generation": get_inputs_for_text_generation, - "text2text-generation": get_inputs_for_text2text_generation, - "zero-shot-image-classification": get_inputs_for_zero_shot_image_classification, - }
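
A minimal usage sketch (not part of the patch) of the per-task API moved out of ``model_inputs.py`` above; it relies only on code added in this diff and assumes the package layout the patch creates. Per the docstring of ``random_input_kwargs``, passing ``config=None`` selects the typical dimensions hard-coded in the task module::

    # Hypothetical example based on onnx_diagnostic/tasks/zero_shot_image_classification.py
    from onnx_diagnostic.tasks import zero_shot_image_classification as task

    # config=None -> typical dimensions (dummy_max_token_id=49408, 3 channels, 224x224 images)
    kwargs, fct = task.random_input_kwargs(None, task.__TASK__)

    # model/config are only used in assertion messages by this task's get_inputs
    data = fct(None, None, **kwargs)
    inputs, dynamic_shapes = data["inputs"], data["dynamic_shapes"]
    # inputs["input_ids"]:    int64 tensor of shape (2, 30)
    # inputs["pixel_values"]: float tensor of shape (3, 3, 224, 224)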