From f2b288a9f6526d8d5f2488fd987fdac73edee9f0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 19 Nov 2024 16:50:53 +0100
Subject: [PATCH 1/7] initial support

---
 examples/pytorch_vlm.yaml                |  42 +++
 optimum_benchmark/backends/timm_utils.py |  33 +-
 .../backends/transformers_utils.py       |  66 ++--
 .../generators/task_generator.py         | 354 +++++++++++++++++-
 optimum_benchmark/task_utils.py          |   1 +
 5 files changed, 441 insertions(+), 55 deletions(-)
 create mode 100644 examples/pytorch_vlm.yaml

diff --git a/examples/pytorch_vlm.yaml b/examples/pytorch_vlm.yaml
new file mode 100644
index 000000000..f11c4fcb3
--- /dev/null
+++ b/examples/pytorch_vlm.yaml
@@ -0,0 +1,42 @@
+defaults:
+  - benchmark
+  - scenario: inference
+  - launcher: process
+  - backend: pytorch
+  - _base_
+  - _self_
+
+name: pytorch_vlm
+
+launcher:
+  device_isolation: true
+  device_isolation_action: warn
+
+backend:
+  device: cuda
+  device_ids: 0
+  no_weights: true
+  torch_dtype: float16
+  model: Qwen/Qwen2-VL-7B-Instruct
+
+scenario:
+  memory: true
+  latency: true
+
+  warmup_runs: 10
+  iterations: 10
+  duration: 10
+
+  input_shapes:
+    # text
+    batch_size: 1
+    sequence_length: 256
+    # image
+    num_images: 1
+    num_channels: 3
+    height: 224
+    width: 224
+
+  generate_kwargs:
+    max_new_tokens: 32
+    min_new_tokens: 32
diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py
index 941e09917..d7b11b1dc 100644
--- a/optimum_benchmark/backends/timm_utils.py
+++ b/optimum_benchmark/backends/timm_utils.py
@@ -35,15 +35,17 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
     shapes = {}
 
     # image input
-    shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    if shapes["num_channels"] is None:
-        # processors have different names for the number of channels
+    if "num_channels" in artifacts_dict:
+        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
+    elif "channels" in artifacts_dict:
         shapes["num_channels"] = artifacts_dict.get("channels", None)
 
-    image_size = artifacts_dict.get("image_size", None)
-    if image_size is None:
-        # processors have different names for the image size
-        image_size = artifacts_dict.get("size", None)
+    if "image_size" in artifacts_dict:
+        image_size = artifacts_dict["image_size"]
+    elif "size" in artifacts_dict:
+        image_size = artifacts_dict["size"]
+    else:
+        image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -57,24 +59,19 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
-    else:
-        shapes["height"] = None
-        shapes["width"] = None
 
-    input_size = artifacts_dict.get("input_size", None)
-    if input_size is not None:
+    if "input_size" in artifacts_dict:
+        input_size = artifacts_dict.get("input_size", None)
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    id2label = artifacts_dict.get("id2label", None)
-    if id2label is not None:
+    if "id2label" in artifacts_dict:
+        id2label = artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-
-    num_classes = artifacts_dict.get("num_classes", None)
-    if num_classes is not None:
-        shapes["num_labels"] = num_classes
+    elif "num_classes" in artifacts_dict:
+        shapes["num_labels"] = artifacts_dict["num_classes"]
 
     return shapes
 
diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py
index 3c7ecdcd6..009c53688 100644
--- a/optimum_benchmark/backends/transformers_utils.py
+++ b/optimum_benchmark/backends/transformers_utils.py
@@ -47,6 +47,7 @@
     "image-to-text": "AutoModelForVision2Seq",
     "text-generation": "AutoModelForCausalLM",
     "text2text-generation": "AutoModelForSeq2SeqLM",
+    "image-text-to-text": "AutoModelForImageTextToText",
     "visual-question-answering": "AutoModelForVisualQuestionAnswering",
     "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"),
 }
@@ -125,22 +126,29 @@ def extract_transformers_shapes_from_artifacts(
     shapes = {}
 
     # text input
-    shapes["vocab_size"] = artifacts_dict.get("vocab_size", None)
-    shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", None)
-    shapes["max_position_embeddings"] = artifacts_dict.get("max_position_embeddings", None)
-    if shapes["max_position_embeddings"] is None:
-        shapes["max_position_embeddings"] = artifacts_dict.get("n_positions", None)
+    if "vocab_size" in artifacts_dict:
+        shapes["vocab_size"] = artifacts_dict["vocab_size"]
+
+    if "type_vocab_size" in artifacts_dict:
+        shapes["type_vocab_size"] = artifacts_dict["type_vocab_size"]
+
+    if "max_position_embeddings" in artifacts_dict:
+        shapes["max_position_embeddings"] = artifacts_dict["max_position_embeddings"]
+    elif "n_positions" in artifacts_dict:
+        shapes["max_position_embeddings"] = artifacts_dict["n_positions"]
 
     # image input
-    shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    if shapes["num_channels"] is None:
-        # processors have different names for the number of channels
+    if "num_channels" in artifacts_dict:
+        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
+    elif "channels" in artifacts_dict:
         shapes["num_channels"] = artifacts_dict.get("channels", None)
 
-    image_size = artifacts_dict.get("image_size", None)
-    if image_size is None:
-        # processors have different names for the image size
-        image_size = artifacts_dict.get("size", None)
+    if "image_size" in artifacts_dict:
+        image_size = artifacts_dict["image_size"]
+    elif "size" in artifacts_dict:
+        image_size = artifacts_dict["size"]
+    else:
+        image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -154,29 +162,37 @@ def extract_transformers_shapes_from_artifacts(
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
-    else:
-        shapes["height"] = None
-        shapes["width"] = None
 
-    input_size = artifacts_dict.get("input_size", None)
-    if input_size is not None:
+    if "input_size" in artifacts_dict:
+        input_size = artifacts_dict["input_size"]
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    id2label = artifacts_dict.get("id2label", None)
-    if id2label is not None:
+    if "id2label" in artifacts_dict:
+        id2label = artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-
-    num_classes = artifacts_dict.get("num_classes", None)
-    if num_classes is not None:
-        shapes["num_labels"] = num_classes
+    elif "num_classes" in artifacts_dict:
+        shapes["num_labels"] = artifacts_dict["num_classes"]
 
     # object detection labels
-    shapes["num_queries"] = artifacts_dict.get("num_queries", None)
-    if shapes["num_queries"] == 0:
-        shapes["num_queries"] = 2
+    if "num_queries" in artifacts_dict:
+        shapes["num_queries"] = artifacts_dict["num_queries"]
+
+    # image-text input
+    if "image_token_id" in artifacts_dict:
+        shapes["image_token_id"] = artifacts_dict["image_token_id"]
+
+    if "vision_config" in artifacts_dict:
+        if "in_chans" in artifacts_dict["vision_config"]:
+            shapes["num_channels"] = artifacts_dict["vision_config"]["in_chans"]
+        if "patch_size" in artifacts_dict["vision_config"]:
+            shapes["patch_size"] = artifacts_dict["vision_config"]["patch_size"]
+        if "temporal_patch_size" in artifacts_dict["vision_config"]:
+            shapes["temporal_patch_size"] = artifacts_dict["vision_config"]["temporal_patch_size"]
+        if "spatial_merge_size" in artifacts_dict["vision_config"]:
+            shapes["spatial_merge_size"] = artifacts_dict["vision_config"]["spatial_merge_size"]
 
     return shapes
 
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index 761315780..411130355 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -2,7 +2,7 @@
 import random
 import string
 from abc import ABC
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 # TODO: drop torch dependency and use numpy instead
 import torch
@@ -15,10 +15,18 @@
 
 
 class TaskGenerator(ABC):
-    def __init__(self, shapes, with_labels: bool):
+    def __init__(self, shapes: Dict[str, int], with_labels: bool):
         self.shapes = shapes
         self.with_labels = with_labels
 
+    @staticmethod
+    def generate_constant_integers(value: int, shape: Tuple[int]):
+        return torch.full(shape, value, dtype=torch.int64)
+
+    @staticmethod
+    def generate_constant_floats(value: float, shape: Tuple[int]):
+        return torch.full(shape, value, dtype=torch.float32)
+
     @staticmethod
     def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]):
         return torch.randint(min_value, max_value, shape)
@@ -44,27 +52,62 @@ def __call__(self):
 
 class TextGenerator(TaskGenerator):
     def input_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
+        )
+
         return self.generate_random_integers(
             min_value=0,
-            max_value=self.shapes["vocab_size"] or DEFAULT_VOCAB_SIZE,
+            max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE,
             shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
         )
 
     def attention_mask(self):
-        return self.generate_random_integers(
-            min_value=1,  # avoid sparse attention
-            max_value=2,
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate attention masks."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate attention masks."
+        )
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
             shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
         )
 
     def token_type_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate token type ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate token type ids."
+        )
+
         return self.generate_random_integers(
             min_value=0,
-            max_value=self.shapes["type_vocab_size"] or DEFAULT_TYPE_VOCAB_SIZE,
+            max_value=self.shapes.get("type_vocab_size", None) or DEFAULT_TYPE_VOCAB_SIZE,
             shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
         )
 
     def position_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate position ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate position ids."
+        )
+
         return self.generate_ranges(
             start=0,
             stop=self.shapes["sequence_length"],
@@ -72,14 +115,33 @@ def position_ids(self):
         )
 
     def requires_token_type_ids(self):
-        return self.shapes["type_vocab_size"] is not None and self.shapes["type_vocab_size"] > 1
+        return self.shapes.get("type_vocab_size", None) is not None and self.shapes["type_vocab_size"] > 1
 
     def requires_position_ids(self):
-        return self.shapes["max_position_embeddings"] is not None
+        return (
+            self.shapes.get("max_position_embeddings", None) is not None and self.shapes["max_position_embeddings"] > 1
+        )
 
 
 class ImageGenerator(TaskGenerator):
     def pixel_values(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate pixel values."
+        )
+        assert self.shapes.get("num_channels", None) is not None, (
+            "Number of channels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate pixel values."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate pixel values."
+        )
+
         return self.generate_random_floats(
             min_value=0,
             max_value=1,
@@ -89,11 +151,32 @@ def pixel_values(self):
 
 class AudioGenerator(TaskGenerator):
     def input_values(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input values."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate input values."
+        )
         return self.generate_random_floats(
             min_value=-1, max_value=1, shape=(self.shapes["batch_size"], self.shapes["sequence_length"])
         )
 
     def input_features(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input features."
+        )
+        assert self.shapes.get("feature_size", None) is not None, (
+            "Feature size couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `feature_size` to be able to generate input features."
+        )
+        assert self.shapes.get("nb_max_frames", None) is not None, (
+            "Number of max frames couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `nb_max_frames` to be able to generate input features."
+        )
+
         return self.generate_random_floats(
             min_value=-1,
             max_value=1,
@@ -103,8 +186,15 @@ def input_features(self):
 
 class TextClassificationGenerator(TextGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
+            shape=(self.shapes["batch_size"],),
         )
 
     def __call__(self):
@@ -127,6 +217,15 @@ def __call__(self):
 
 class TokenClassificationGenerator(TextGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0,
             max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
@@ -177,11 +276,29 @@ def __call__(self):
 
 class QuestionAnsweringGenerator(TextGenerator):
     def start_positions(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate start positions."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate start positions."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
         )
 
     def end_positions(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate end positions."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate end positions."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
         )
@@ -221,6 +338,15 @@ def __call__(self):
 
 class MultipleChoiceGenerator(TextGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("num_choices", None) is not None, (
+            "Number of choices must be provided, "
+            "please provide it in `input_shapes` as `num_choices` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_choices"], shape=(self.shapes["batch_size"],)
         )
@@ -255,8 +381,15 @@ def __call__(self):
 
 class ImageClassificationGenerator(ImageGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
+            shape=(self.shapes["batch_size"],),
         )
 
     def __call__(self):
@@ -271,6 +404,15 @@ def __call__(self):
 
 class ObjectDetectionGenerator(ImageGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("num_queries", None) is not None, (
+            "Number of queries must be provided, "
+            "please provide it in `input_shapes` as `num_queries` to be able to generate labels."
+        )
+
         return [
             {
                 "class_labels": self.generate_random_integers(
@@ -295,6 +437,19 @@ def __call__(self):
 
 class SemanticSegmentationGenerator(ImageGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate labels."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0,
             max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
@@ -313,6 +468,15 @@ def __call__(self):
 
 class AudioClassificationGenerator(AudioGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("num_labels", None) is not None, (
+            "Number of labels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
         )
@@ -329,6 +493,19 @@ def __call__(self):
 
 class AutomaticSpeechRecognitionGenerator(AudioGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
+        )
+        assert self.shapes.get("num_labels", None) is not None, (
+            "Number of labels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0,
             max_value=self.shapes["vocab_size"] or DEFAULT_TYPE_VOCAB_SIZE,
@@ -347,6 +524,11 @@ def __call__(self):
 
 class PromptGenerator(TaskGenerator):
     def prompt(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate prompts."
+        )
+
         return self.generate_random_strings(num_seq=self.shapes["batch_size"])
 
     def __call__(self):
@@ -375,6 +557,155 @@ def __call__(self):
         return dummy
 
 
+class ImageTextToTextGenerationGenerator(TextGenerator, ImageGenerator):
+    def input_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
+        )
+        assert self.shapes.get("num_images", None) is not None, (
+            "Number of images must be provided, "
+            "please provide it in `input_shapes` as `num_images` to be able to generate input ids."
+        )
+        assert self.shapes.get("num_channels", None) is not None, (
+            "Number of channels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_channels` to be able to generate input ids."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate input ids."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate input ids."
+        )
+        assert self.shapes.get("patch_size", None) is not None, (
+            "Patch size must be provided, "
+            "please provide it in `input_shapes` as `patch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("temporal_patch_size", None) is not None, (
+            "Temporal patch size must be provided, "
+            "please provide it in `input_shapes` as `temporal_patch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("spatial_merge_size", None) is not None, (
+            "Spatial merge size must be provided, "
+            "please provide it in `input_shapes` as `spatial_merge_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("image_token_id", None) is not None, (
+            "Image token id must be provided, "
+            "please provide it in `input_shapes` as `image_token_id` to be able to generate input ids."
+        )
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"],
+            ),
+        )
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(
+                self.shapes["batch_size"],
+                int(
+                    self.shapes["num_images"]
+                    * self.shapes["height"]
+                    * self.shapes["width"]
+                    / self.shapes["temporal_patch_size"]
+                    / self.shapes["spatial_merge_size"]
+                    / self.shapes["patch_size"] ** 2
+                ),
+            ),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def pixel_values(self):
+        assert self.shapes.get("num_images", None) is not None, (
+            "Number of images must be provided, "
+            "please provide it in `input_shapes` as `num_images` to be able to generate pixel values."
+        )
+        assert self.shapes.get("num_channels", None) is not None, (
+            "Number of channels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate pixel values."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate pixel values."
+        )
+        assert self.shapes.get("patch_size", None) is not None, (
+            "Patch size must be provided, "
+            "please provide it in `input_shapes` as `patch_size` to be able to generate pixel values."
+        )
+        assert self.shapes.get("temporal_patch_size", None) is not None, (
+            "Temporal patch size must be provided, "
+            "please provide it in `input_shapes` as `temporal_patch_size` to be able to generate pixel values."
+        )
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["num_images"]
+                * int(self.shapes["height"] / self.shapes["patch_size"])
+                * int(self.shapes["width"] / self.shapes["patch_size"]),
+                self.shapes["num_channels"]
+                * self.shapes["patch_size"]
+                * self.shapes["patch_size"]
+                * self.shapes["temporal_patch_size"],
+            ),
+        )
+
+    def image_grid_thw(self):
+        assert self.shapes.get("num_images", None) is not None, (
+            "Number of images must be provided, "
+            "please provide it in `input_shapes` as `num_images` to be able to generate image grid."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate image grid."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate image grid."
+        )
+
+        return torch.tensor(
+            [
+                [
+                    self.shapes["num_images"],
+                    int(self.shapes["height"] / self.shapes["patch_size"]),
+                    int(self.shapes["width"] / self.shapes["patch_size"]),
+                ]
+            ]
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["image_grid_thw"] = self.image_grid_thw()
+
+        print("input_ids", dummy["input_ids"].shape)
+        print("pixel_values", dummy["pixel_values"].shape)
+        print("image_grid_thw", dummy["image_grid_thw"].shape)
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
 TASKS_TO_GENERATORS = {
     # transformers models tasks
     "feature-extraction": FeatureExtractionGenerator,
@@ -388,6 +719,7 @@
     "image-classification": ImageClassificationGenerator,
     "object-detection": ObjectDetectionGenerator,
     "semantic-segmentation": SemanticSegmentationGenerator,
+    "image-text-to-text": ImageTextToTextGenerationGenerator,
     # diffusers pipelines tasks
     "text-to-image": PromptGenerator,
     "stable-diffusion": PromptGenerator,
diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py
index 337e835ec..0a2a98c2b 100644
--- a/optimum_benchmark/task_utils.py
+++ b/optimum_benchmark/task_utils.py
@@ -47,6 +47,7 @@
     "image-to-text",
     "conversational",
     "text-generation",
+    "image-text-to-text",
     "text2text-generation",
     "automatic-speech-recognition",
 ]

From f2a7a2c67c4269e425d202229b65716dac40fb8c Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 20 Nov 2024 07:38:12 +0100
Subject: [PATCH 2/7] clean up

---
 .../generators/task_generator.py    | 34 +++++++++++++------
 .../scenarios/inference/config.py   |  5 ++-
 .../scenarios/inference/scenario.py |  4 +--
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index 411130355..b99c918c9 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -145,7 +145,12 @@ def pixel_values(self):
         return self.generate_random_floats(
             min_value=0,
             max_value=1,
-            shape=(self.shapes["batch_size"], self.shapes["num_channels"], self.shapes["height"], self.shapes["width"]),
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_channels"],
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
         )
 
 
@@ -160,7 +165,12 @@ def input_values(self):
             "please provide it in `input_shapes` as `sequence_length` to be able to generate input values."
         )
         return self.generate_random_floats(
-            min_value=-1, max_value=1, shape=(self.shapes["batch_size"], self.shapes["sequence_length"])
+            min_value=-1,
+            max_value=1,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"],
+            ),
         )
 
     def input_features(self):
@@ -180,7 +190,11 @@ def input_features(self):
         return self.generate_random_floats(
             min_value=-1,
             max_value=1,
-            shape=(self.shapes["batch_size"], self.shapes["feature_size"], self.shapes["nb_max_frames"]),
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["feature_size"],
+                self.shapes["nb_max_frames"],
+            ),
         )
 
 
@@ -286,7 +300,9 @@ def start_positions(self):
         )
 
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["sequence_length"],
+            shape=(self.shapes["batch_size"],),
         )
 
     def end_positions(self):
@@ -300,7 +316,9 @@ def end_positions(self):
         )
 
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["sequence_length"],
+            shape=(self.shapes["batch_size"],),
         )
 
     def __call__(self):
@@ -557,7 +575,7 @@ def __call__(self):
         return dummy
 
 
-class ImageTextToTextGenerationGenerator(TextGenerator, ImageGenerator):
+class ImageTextToTextGenerationGenerator(TaskGenerator):
     def input_ids(self):
@@ -696,10 +714,6 @@ def __call__(self):
         dummy["pixel_values"] = self.pixel_values()
         dummy["image_grid_thw"] = self.image_grid_thw()
 
-        print("input_ids", dummy["input_ids"].shape)
-        print("pixel_values", dummy["pixel_values"].shape)
-        print("image_grid_thw", dummy["image_grid_thw"].shape)
-
         if self.with_labels:
             dummy["labels"] = self.input_ids()
 
         return dummy
diff --git a/optimum_benchmark/scenarios/inference/config.py b/optimum_benchmark/scenarios/inference/config.py
index 2c05d97f8..57d482abf 100644
--- a/optimum_benchmark/scenarios/inference/config.py
+++ b/optimum_benchmark/scenarios/inference/config.py
@@ -7,7 +7,10 @@
 
 LOGGER = getLogger("inference")
 
-INPUT_SHAPES = {"batch_size": 2, "num_choices": 2, "sequence_length": 16}
+INPUT_SHAPES = {
+    "batch_size": 2,
+    "sequence_length": 16,
+}
 
 
 @dataclass
diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py
index f2f18e0b1..8b3bb1b76 100644
--- a/optimum_benchmark/scenarios/inference/scenario.py
+++ b/optimum_benchmark/scenarios/inference/scenario.py
@@ -414,8 +414,8 @@ def atomic_call_volume(self) -> int:  # in images
     @property
     def atomic_prefill_volume(self) -> int:  # in tokens
         if {"input_ids", "prompt", "prompts"} & set(self.inputs.keys()):
-            # text conditioned generation (1 bos token or sequence_length tokens)
-            return self.config.input_shapes["batch_size"] * max(self.config.input_shapes["sequence_length"], 1)
+            # text conditioned generation (sequence_length tokens)
+            return self.config.input_shapes["batch_size"] * self.config.input_shapes["sequence_length"]
         else:
             # image/audio conditioned generation (1 bos token)
             return self.config.input_shapes["batch_size"]

From 9a854ae7750937d7297d8777be4c65e7ff6994f0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 20 Nov 2024 08:41:02 +0100
Subject: [PATCH 3/7] simpler

---
 optimum_benchmark/backends/base.py       |   1 -
 optimum_benchmark/backends/timm_utils.py |   9 +-
 .../backends/transformers_utils.py       | 135 +++++----
 .../generators/dataset_generator.py      |   6 +-
 .../generators/input_generator.py        |   4 +-
 .../generators/task_generator.py         | 268 +++-----
 test.py                                  |  53 ++++
 7 files changed, 179 insertions(+), 297 deletions(-)
 create mode 100644 test.py

diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py
index 8a59ac6d6..44d71ef4b 100644
--- a/optimum_benchmark/backends/base.py
+++ b/optimum_benchmark/backends/base.py
@@ -36,7 +36,6 @@ class Backend(Generic[BackendConfigT], ABC):
     NAME: ClassVar[str]
 
-    model_type: str
     model_shapes: Dict[str, int]
 
     pretrained_model: PreTrainedModel
diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py
index d7b11b1dc..dbaf36fd5 100644
--- a/optimum_benchmark/backends/timm_utils.py
+++ b/optimum_benchmark/backends/timm_utils.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Dict
 
 from transformers import PretrainedConfig
@@ -66,12 +67,8 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
-    # classification labels
-    if "id2label" in artifacts_dict:
-        id2label = artifacts_dict["id2label"]
-        shapes["num_labels"] = len(id2label)
-    elif "num_classes" in artifacts_dict:
-        shapes["num_labels"] = artifacts_dict["num_classes"]
+    if not all(key in shapes for key in ("num_channels", "height", "width")):
+        warnings.warn("Could not extract shapes [num_channels, height, width] from timm model config.")
 
     return shapes
 
diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py
index 009c53688..8ecbde01f 100644
--- a/optimum_benchmark/backends/transformers_utils.py
+++ b/optimum_benchmark/backends/transformers_utils.py
@@ -1,4 +1,3 @@
-import warnings
 from contextlib import contextmanager
 from typing import Any, Dict, Optional, Union
 
@@ -7,6 +6,7 @@
 from transformers import (
     AutoConfig,
     AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoProcessor,
     AutoTokenizer,
     FeatureExtractionMixin,
@@ -67,7 +67,7 @@
 else:
     TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {}
 
-PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, SpecialTokensMixin, ProcessorMixin]
+PretrainedProcessor = Union["FeatureExtractionMixin", "ImageProcessingMixin", "SpecialTokensMixin", "ProcessorMixin"]
 
 
 def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig":
@@ -92,61 +92,80 @@ def get_transformers_pretrained_processor(model: str, **kwargs) -> Optional["PretrainedProcessor"]:
         return AutoFeatureExtractor.from_pretrained(model, **kwargs)
     except Exception:
         try:
-            return AutoTokenizer.from_pretrained(model, **kwargs)
+            return AutoImageProcessor.from_pretrained(model, **kwargs)
         except Exception:
-            return None
+            try:
+                return AutoTokenizer.from_pretrained(model, **kwargs)
+            except Exception:
+                return None
+
+
+def get_flat_dict(d: Dict[str, Any]) -> Dict[str, Any]:
+    flat_dict = {}
+    for k, v in d.items():
+        if isinstance(v, dict):
+            flat_dict.update(get_flat_dict(v))
+        else:
+            flat_dict[k] = v
+    return flat_dict
+
+
+def get_flat_artifact_dict(artifact: Union[PretrainedConfig, PretrainedProcessor]) -> Dict[str, Any]:
+    if isinstance(artifact, ProcessorMixin):
+        artifact_dict = {}
+
+        for attribute in artifact.attributes:
+            artifact_dict.update(get_flat_artifact_dict(attribute))
+    else:
+        if hasattr(artifact, "to_dict"):
+            artifact_dict = {k: v for k, v in artifact.to_dict().items() if v is not None}
+        else:
+            try:
+                artifact_dict = {k: getattr(artifact, k) for k in dir(artifact) if getattr(artifact, k) is not None}
+            except Exception:
+                pass
+
+    artifact_dict = get_flat_dict(artifact_dict)
+
+    return artifact_dict
 
 
 def extract_transformers_shapes_from_artifacts(
-    config: Optional["PretrainedConfig"] = None, processor: Optional["PretrainedProcessor"] = None
+    config: Optional["PretrainedConfig"] = None,
+    processor: Optional["PretrainedProcessor"] = None,
 ) -> Dict[str, Any]:
-    artifacts_dict = {}
+    flat_artifacts_dict = {}
 
-    if config is not None and hasattr(config, "to_dict"):
-        config_dict = {k: v for k, v in config.to_dict().items() if v is not None}
-        artifacts_dict.update(config_dict)
-    elif config is not None:
-        try:
-            config_dict = {k: getattr(config, k) for k in dir(config) if isinstance(getattr(config, k), int)}
-            artifacts_dict.update(config_dict)
-        except Exception:
-            warnings.warn(f"Could not extract shapes from config {config}")
+    if config is not None:
+        flat_artifacts_dict.update(get_flat_artifact_dict(config))
 
-    if processor is not None and hasattr(processor, "to_dict"):
-        processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None}
-        artifacts_dict.update(processor_dict)
-    elif processor is not None:
-        try:
-            processor_dict = {
-                k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)
-            }
-        except Exception:
-            warnings.warn(f"Could not extract shapes from processor {processor}")
+    if processor is not None:
+        flat_artifacts_dict.update(get_flat_artifact_dict(processor))
 
     shapes = {}
 
     # text input
-    if "vocab_size" in artifacts_dict:
-        shapes["vocab_size"] = artifacts_dict["vocab_size"]
+    if "vocab_size" in flat_artifacts_dict:
+        shapes["vocab_size"] = flat_artifacts_dict["vocab_size"]
 
-    if "type_vocab_size" in artifacts_dict:
-        shapes["type_vocab_size"] = artifacts_dict["type_vocab_size"]
+    if "type_vocab_size" in flat_artifacts_dict:
+        shapes["type_vocab_size"] = flat_artifacts_dict["type_vocab_size"]
 
-    if "max_position_embeddings" in artifacts_dict:
-        shapes["max_position_embeddings"] = artifacts_dict["max_position_embeddings"]
-    elif "n_positions" in artifacts_dict:
-        shapes["max_position_embeddings"] = artifacts_dict["n_positions"]
+    if "max_position_embeddings" in flat_artifacts_dict:
+        shapes["max_position_embeddings"] = flat_artifacts_dict["max_position_embeddings"]
+    elif "n_positions" in flat_artifacts_dict:
+        shapes["max_position_embeddings"] = flat_artifacts_dict["n_positions"]
 
     # image input
-    if "num_channels" in artifacts_dict:
-        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    elif "channels" in artifacts_dict:
-        shapes["num_channels"] = artifacts_dict.get("channels", None)
+    if "num_channels" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["num_channels"]
+    elif "channels" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["channels"]
 
-    if "image_size" in artifacts_dict:
-        image_size = artifacts_dict["image_size"]
-    elif "size" in artifacts_dict:
-        image_size = artifacts_dict["size"]
+    if "image_size" in flat_artifacts_dict:
+        image_size = flat_artifacts_dict["image_size"]
+    elif "size" in flat_artifacts_dict:
+        image_size = flat_artifacts_dict["size"]
     else:
         image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -163,34 +182,32 @@ def extract_transformers_shapes_from_artifacts(
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
 
-    if "input_size" in artifacts_dict:
-        input_size = artifacts_dict["input_size"]
+    if "input_size" in flat_artifacts_dict:
+        input_size = flat_artifacts_dict["input_size"]
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    if "id2label" in artifacts_dict:
-        id2label = artifacts_dict["id2label"]
+    if "id2label" in flat_artifacts_dict:
+        id2label = flat_artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-    elif "num_classes" in artifacts_dict:
-        shapes["num_labels"] = artifacts_dict["num_classes"]
+    elif "num_classes" in flat_artifacts_dict:
+        shapes["num_labels"] = flat_artifacts_dict["num_classes"]
 
     # object detection labels
-    if "num_queries" in artifacts_dict:
-        shapes["num_queries"] = artifacts_dict["num_queries"]
+    if "num_queries" in flat_artifacts_dict:
+        shapes["num_queries"] = flat_artifacts_dict["num_queries"]
 
     # image-text input
-    if "image_token_id" in artifacts_dict:
-        shapes["image_token_id"] = artifacts_dict["image_token_id"]
-
-    if "vision_config" in artifacts_dict:
-        if "in_chans" in artifacts_dict["vision_config"]:
-            shapes["num_channels"] = artifacts_dict["vision_config"]["in_chans"]
-        if "patch_size" in artifacts_dict["vision_config"]:
-            shapes["patch_size"] = artifacts_dict["vision_config"]["patch_size"]
-        if "temporal_patch_size" in artifacts_dict["vision_config"]:
-            shapes["temporal_patch_size"] = artifacts_dict["vision_config"]["temporal_patch_size"]
-        if "spatial_merge_size" in artifacts_dict["vision_config"]:
-            shapes["spatial_merge_size"] = artifacts_dict["vision_config"]["spatial_merge_size"]
+    if "image_token_id" in flat_artifacts_dict:
+        shapes["image_token_id"] = flat_artifacts_dict["image_token_id"]
+    if "in_chans" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["in_chans"]
+    if "patch_size" in flat_artifacts_dict:
+        shapes["patch_size"] = flat_artifacts_dict["patch_size"]
+    if "temporal_patch_size" in flat_artifacts_dict:
+        shapes["temporal_patch_size"] = flat_artifacts_dict["temporal_patch_size"]
+    if "spatial_merge_size" in flat_artifacts_dict:
+        shapes["spatial_merge_size"] = flat_artifacts_dict["spatial_merge_size"]
 
     return shapes
 
diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py
index bbaa87f0a..781e20d96 100644
--- a/optimum_benchmark/generators/dataset_generator.py
+++ b/optimum_benchmark/generators/dataset_generator.py
@@ -9,11 +9,11 @@ class DatasetGenerator:
     task_generator: TaskGenerator
 
     def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None:
-        dataset_shapes["batch_size"] = dataset_shapes["dataset_size"]
+        dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size", None)
 
         if task in TASKS_TO_GENERATORS:
-            shapes = {**dataset_shapes, **model_shapes}
-            self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=True)
+            all_shapes = {**model_shapes, **dataset_shapes}  # dataset_shapes take precedence over model_shapes
+            self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=True)
         else:
             raise NotImplementedError(
                 f"Task {task} is not supported. \n"
diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py
index 1dd5501a9..10432fa95 100644
--- a/optimum_benchmark/generators/input_generator.py
+++ b/optimum_benchmark/generators/input_generator.py
@@ -8,8 +8,8 @@ class InputGenerator:
     def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None:
         if task in TASKS_TO_GENERATORS:
-            shapes = {**input_shapes, **model_shapes}
-            self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=False)
+            all_shapes = {**model_shapes, **input_shapes}  # input_shapes take precedence over model_shapes
+            self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=False)
         else:
             raise NotImplementedError(
                 f"Task {task} is not supported. "
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index b99c918c9..ccf89cdc1 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -19,6 +19,13 @@ class TaskGenerator(ABC):
         self.shapes = shapes
         self.with_labels = with_labels
 
+    def assert_not_missing_shapes(self, required_shapes: List[str]):
+        for shape in required_shapes:
+            assert self.shapes.get(shape, None) is not None, (
+                f"{shape} either couldn't be inferred automatically from model artifacts or should be provided by the user. "
+                f"Please provide it under `scenario.input_shapes.{shape}` or open an issue/PR in optimum-benchmark repository. "
+            )
+
     @staticmethod
     def generate_constant_integers(value: int, shape: Tuple[int]):
         return torch.full(shape, value, dtype=torch.int64)
@@ -59,14 +66,7 @@ def __call__(self):
 
 class TextGenerator(TaskGenerator):
     def input_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -77,14 +77,7 @@ def input_ids(self):
         )
 
     def attention_mask(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate attention masks."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate attention masks."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_constant_integers(
             value=1,  # no sparsity
@@ -95,14 +88,7 @@ def attention_mask(self):
         )
 
     def token_type_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate token type ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate token type ids."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -113,14 +99,7 @@ def token_type_ids(self):
         )
 
     def position_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate position ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate position ids."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_ranges(
             start=0,
             stop=self.shapes["sequence_length"],
@@ -136,22 +115,8 @@ def requires_position_ids(self):
 
 class ImageGenerator(TaskGenerator):
     def pixel_values(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate pixel values."
-        )
-        assert self.shapes.get("num_channels", None) is not None, (
-            "Number of channels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
-        )
-        assert self.shapes.get("height", None) is not None, (
-            "Height couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `height` to be able to generate pixel values."
-        )
-        assert self.shapes.get("width", None) is not None, (
-            "Width couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `width` to be able to generate pixel values."
-        )
+        self.assert_not_missing_shapes(["batch_size", "num_channels", "height", "width"])
 
         return self.generate_random_floats(
             min_value=0,
@@ -167,14 +132,8 @@ def pixel_values(self):
 
 class AudioGenerator(TaskGenerator):
     def input_values(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input values."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate input values."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
+
         return self.generate_random_floats(
             min_value=-1,
             max_value=1,
@@ -186,18 +146,7 @@ def input_values(self):
         )
 
     def input_features(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input features."
-        )
-        assert self.shapes.get("feature_size", None) is not None, (
-            "Feature size couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `feature_size` to be able to generate input features."
-        )
-        assert self.shapes.get("nb_max_frames", None) is not None, (
-            "Number of max frames couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `nb_max_frames` to be able to generate input features."
-        )
+        self.assert_not_missing_shapes(["batch_size", "feature_size", "nb_max_frames"])
 
         return self.generate_random_floats(
             min_value=-1,
@@ -212,10 +161,7 @@ def input_features(self):
 
 class TextClassificationGenerator(TextGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -243,14 +189,7 @@ def __call__(self):
 
 class TokenClassificationGenerator(TextGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -302,14 +241,7 @@ def __call__(self):
 
 class QuestionAnsweringGenerator(TextGenerator):
     def start_positions(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate start positions."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate start positions."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -318,14 +250,7 @@ def start_positions(self):
         )
 
     def end_positions(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate end positions."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate end positions."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -368,14 +293,7 @@ def __call__(self):
 
 class MultipleChoiceGenerator(TextGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("num_choices", None) is not None, (
-            "Number of choices must be provided, "
-            "please provide it in `input_shapes` as `num_choices` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "num_choices"])
 
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_choices"], shape=(self.shapes["batch_size"],)
@@ -411,10 +329,7 @@ def __call__(self):
 
 class ImageClassificationGenerator(ImageGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -434,14 +349,7 @@ def __call__(self):
 
 class ObjectDetectionGenerator(ImageGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("num_queries", None) is not None, (
-            "Number of queries must be provided, "
-            "please provide it in `input_shapes` as `num_queries` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "num_queries"])
 
         return [
             {
@@ -467,18 +375,7 @@ def __call__(self):
 
 class SemanticSegmentationGenerator(ImageGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("height", None) is not None, (
-            "Height couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `height` to be able to generate labels."
-        )
-        assert self.shapes.get("width", None) is not None, (
-            "Width couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `width` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "height", "width"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -498,14 +395,7 @@ def __call__(self):
 
 class AudioClassificationGenerator(AudioGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("num_labels", None) is not None, (
-            "Number of labels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
@@ -524,18 +413,7 @@ def __call__(self):
 
 class AutomaticSpeechRecognitionGenerator(AudioGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
-        )
-        assert self.shapes.get("num_labels", None) is not None, (
-            "Number of labels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -554,10 +432,7 @@ def __call__(self):
 
 class PromptGenerator(TaskGenerator):
     def prompt(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate prompts."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_strings(num_seq=self.shapes["batch_size"])
 
@@ -575,45 +450,19 @@ def __call__(self):
 
 class ImageTextToTextGenerationGenerator(TaskGenerator):
     def input_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
-        )
-        assert self.shapes.get("num_images", None) is not None, (
-            "Number of images must be provided, "
-            "please provide it in `input_shapes` as `num_images` to be able to generate input ids."
-        )
-        assert self.shapes.get("num_channels", None) is not None, (
-            "Number of channels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_channels` to be able to generate input ids."
-        )
-        assert self.shapes.get("height", None) is not None, (
-            "Height couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `height` to be able to generate input ids."
-        )
-        assert self.shapes.get("width", None) is not None, (
-            "Width couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `width` to be able to generate input ids."
-        )
-        assert self.shapes.get("patch_size", None) is not None, (
-            "Patch size must be provided, "
-            "please provide it in `input_shapes` as `patch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("temporal_patch_size", None) is not None, (
-            "Temporal patch size must be provided, "
-            "please provide it in `input_shapes` as `temporal_patch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("spatial_merge_size", None) is not None, (
-            "Spatial merge size must be provided, "
-            "please provide it in `input_shapes` as `spatial_merge_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("image_token_id", None) is not None, (
-            "Image token id must be provided, "
-            "please provide it in `input_shapes` as `image_token_id` to be able to generate input ids."
-        )
+        self.assert_not_missing_shapes(
+            [
+                "batch_size",
+                "sequence_length",
+                "num_images",
+                "num_channels",
+                "height",
+                "width",
+                "patch_size",
+                "temporal_patch_size",
+                "spatial_merge_size",
+                "image_token_id",
+            ]
+        )
 
         text_tokens = self.generate_random_integers(
             min_value=0,
@@ -644,29 +493,10 @@ def input_ids(self):
         return torch.cat((text_tokens, image_tokens), dim=1)
 
     def pixel_values(self):
-        assert self.shapes.get("num_images", None) is not None, (
-            "Number of images must be provided, "
-            "please provide it in `input_shapes` as `num_images` to be able to generate pixel values."
-        )
-        assert self.shapes.get("num_channels", None) is not None, (
-            "Number of channels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
- ) - assert self.shapes.get("width", None) is not None, ( - "Width couldn't be inferred automatically from model, " - "please provide it in `input_shapes` as `width` to be able to generate image grid." - ) + self.assert_not_missing_shapes(["num_images", "height", "width", "patch_size"]) return torch.tensor( [ diff --git a/test.py b/test.py new file mode 100644 index 000000000..6c96b0581 --- /dev/null +++ b/test.py @@ -0,0 +1,53 @@ +from transformers import AutoProcessor, Idefics2Processor + +processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics-9b") +print(processor.to_dict()) + +# dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" +# dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg" + +# prompts = [ +# [ +# "User:", +# dogs_image_url_1, +# "Describe this image.\nAssistant: An image of two dogs.\n", +# "User:", +# dogs_image_url_2, +# "Describe this image.\nAssistant:", +# ] +# ] + +# inputs = processor(prompts, return_tensors="pt") + +# print("inputs_ids", inputs["input_ids"].shape) +# print("pixel_values", inputs["pixel_values"].shape) + +# batch_size = 1 +# sequence_length = 128 + +# num_images = 1 +# num_channels = 3 +# height = 224 +# width = 224 + +# patch_size = 14 +# temporal_patch_size = 2 + +# input_ids = torch.rand( +# size=( +# batch_size, +# sequence_length, +# ) +# ) + +# pixel_values = torch.rand( +# size=( +# num_images * int(height / patch_size) * int(width / patch_size), +# num_channels * patch_size * patch_size * temporal_patch_size, +# ) +# ) +# image_grid_thw = torch.tensor([[num_images, int(height / patch_size), int(width / patch_size)]]) + + +# print("image_grid_thw", image_grid_thw) +# print("pixel_values", pixel_values.shape) From e5bf852f8ba04df0c11150cccdcc5fe9d3dd8909 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 20 Nov 2024 12:51:49 +0100 Subject: [PATCH 4/7] support idefics and idefics2 --- examples/pytorch_vlm.yaml | 4 +- .../backends/transformers_utils.py | 41 +-- optimum_benchmark/generators/base.py | 52 ++++ .../generators/dataset_generator.py | 38 ++- .../generators/input_generator.py | 35 ++- .../generators/model_generator.py | 259 ++++++++++++++++++ .../generators/task_generator.py | 167 ++--------- .../scenarios/inference/scenario.py | 5 +- 8 files changed, 407 insertions(+), 194 deletions(-) create mode 100644 optimum_benchmark/generators/base.py create mode 100644 optimum_benchmark/generators/model_generator.py diff --git a/examples/pytorch_vlm.yaml b/examples/pytorch_vlm.yaml index f11c4fcb3..c4bb786fe 100644 --- a/examples/pytorch_vlm.yaml +++ b/examples/pytorch_vlm.yaml @@ -17,7 +17,7 @@ backend: device_ids: 0 no_weights: true torch_dtype: float16 - model: Qwen/Qwen2-VL-7B-Instruct + model: HuggingFaceM4/idefics2-8b scenario: memory: true @@ -32,7 +32,7 @@ scenario: batch_size: 1 sequence_length: 256 # image - num_images: 1 + num_images: 2 num_channels: 3 height: 224 width: 224 diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 8ecbde01f..5c1b18dda 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -111,19 +111,22 @@ def get_flat_dict(d: Dict[str, Any]) -> Dict[str, Any]: def get_flat_artifact_dict(artifact: Union[PretrainedConfig, PretrainedProcessor]) -> Dict[str, Any]: - if isinstance(artifact, ProcessorMixin): - artifact_dict = {} + artifact_dict = {} + if 
isinstance(artifact, ProcessorMixin):
+        artifact_dict.update(
+            {k: v for k, v in artifact.__dict__.items() if isinstance(v, (int, str, float, bool, list, tuple, dict))}
+        )
         for attribute in artifact.attributes:
-            artifact_dict.update(get_flat_artifact_dict(attribute))
+            artifact_dict.update(get_flat_artifact_dict(getattr(artifact, attribute)))
+    elif hasattr(artifact, "to_dict"):
+        artifact_dict.update(
+            {k: v for k, v in artifact.to_dict().items() if isinstance(v, (int, str, float, bool, list, tuple, dict))}
+        )
     else:
-        if hasattr(artifact, "to_dict"):
-            artifact_dict = {k: v for k, v in artifact.to_dict().items() if v is not None}
-        else:
-            try:
-                artifact_dict = {k: getattr(artifact, k) for k in dir(artifact) if getattr(artifact, k) is not None}
-            except Exception:
-                pass
+        artifact_dict.update(
+            {k: v for k, v in artifact.__dict__.items() if isinstance(v, (int, str, float, bool, list, tuple, dict))}
+        )

     artifact_dict = get_flat_dict(artifact_dict)

@@ -198,16 +201,22 @@ def extract_transformers_shapes_from_artifacts(
         shapes["num_queries"] = flat_artifacts_dict["num_queries"]

     # image-text input
-    if "image_token_id" in flat_artifacts_dict:
-        shapes["image_token_id"] = flat_artifacts_dict["image_token_id"]
-    if "in_chans" in flat_artifacts_dict:
-        shapes["num_channels"] = flat_artifacts_dict["in_chans"]
+    if "patch_size" in flat_artifacts_dict:
         shapes["patch_size"] = flat_artifacts_dict["patch_size"]
-    if "temporal_patch_size" in flat_artifacts_dict:
-        shapes["temporal_patch_size"] = flat_artifacts_dict["temporal_patch_size"]
+    if "in_chans" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["in_chans"]
+    if "image_seq_len" in flat_artifacts_dict:
+        shapes["image_seq_len"] = flat_artifacts_dict["image_seq_len"]
+    if "image_token_id" in flat_artifacts_dict:
+        shapes["image_token_id"] = flat_artifacts_dict["image_token_id"]
     if "spatial_merge_size" in flat_artifacts_dict:
         shapes["spatial_merge_size"] = flat_artifacts_dict["spatial_merge_size"]
+    if "do_image_splitting" in flat_artifacts_dict:
+        shapes["do_image_splitting"] = flat_artifacts_dict["do_image_splitting"]
+
+    if "temporal_patch_size" in flat_artifacts_dict:
+        shapes["temporal_patch_size"] = flat_artifacts_dict["temporal_patch_size"]

     return shapes

diff --git a/optimum_benchmark/generators/base.py b/optimum_benchmark/generators/base.py
new file mode 100644
index 000000000..e4d779b93
--- /dev/null
+++ b/optimum_benchmark/generators/base.py
@@ -0,0 +1,52 @@
+import logging
+import random
+import string
+from abc import ABC
+from typing import Dict, List, Tuple
+
+import torch
+
+LOGGER = logging.getLogger("generators")
+
+
+class BaseGenerator(ABC):
+    def __init__(self, shapes: Dict[str, int], with_labels: bool):
+        self.shapes = shapes
+        self.with_labels = with_labels
+
+    def assert_not_missing_shapes(self, required_shapes: List[str]):
+        for shape in required_shapes:
+            assert self.shapes.get(shape, None) is not None, (
+                f"{shape} couldn't be inferred automatically from the model artifacts and must be provided by the user. "
+                f"Please provide it under `scenario.input_shapes.{shape}` or open an issue/PR in optimum-benchmark repository. 
" + ) + + @staticmethod + def generate_constant_integers(value: int, shape: Tuple[int]): + return torch.full(shape, value, dtype=torch.int64) + + @staticmethod + def generate_constant_floats(value: float, shape: Tuple[int]): + return torch.full(shape, value, dtype=torch.float32) + + @staticmethod + def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]): + return torch.randint(min_value, max_value, shape) + + @staticmethod + def generate_random_floats(min_value: float, max_value: float, shape: Tuple[int]): + return torch.rand(shape) * (max_value - min_value) + min_value + + @staticmethod + def generate_ranges(start: int, stop: int, shape: Tuple[int]): + return torch.arange(start, stop).repeat(shape[0], 1) + + @staticmethod + def generate_random_strings(num_seq: int) -> List[str]: + return [ + "".join(random.choice(string.ascii_letters + string.digits) for _ in range(random.randint(10, 100))) + for _ in range(num_seq) + ] + + def __call__(self): + raise NotImplementedError("Generator must implement __call__ method") diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py index 781e20d96..efc8a0294 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -1,29 +1,41 @@ -from typing import Dict +from typing import Dict, Optional from datasets import Dataset -from .task_generator import TASKS_TO_GENERATORS, TaskGenerator +from .base import BaseGenerator +from .model_generator import MODEL_TYPE_TO_GENERATORS +from .task_generator import TASKS_TO_GENERATORS class DatasetGenerator: - task_generator: TaskGenerator + generator: BaseGenerator - def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None: - dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size", None) + def __init__( + self, + task: str, + dataset_shapes: Dict[str, int], + model_shapes: Dict[str, int], + model_type: Optional[str] = None, + ) -> None: + # dataset_shapes take precedence over model_shapes + all_shapes = {**model_shapes, **dataset_shapes} + all_shapes["batch_size"] = all_shapes.pop("dataset_size", None) - if task in TASKS_TO_GENERATORS: - all_shapes = {**model_shapes, **dataset_shapes} # dataset_shapes take precedence over model_shapes - self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=True) + if model_type in MODEL_TYPE_TO_GENERATORS: + self.generator = MODEL_TYPE_TO_GENERATORS[model_type](shapes=all_shapes, with_labels=True) + elif task in TASKS_TO_GENERATORS: + self.generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=True) else: raise NotImplementedError( - f"Task {task} is supported. \n" - f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. \n" - "If you want to add support for this task, " - "please submit a PR or a feature request to optimum-benchmark. \n" + f"Task {task} is not supported for dataset generation. " + f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. " + f"Available model types: {list(MODEL_TYPE_TO_GENERATORS.keys())}. " + "If you want to add support for this task or model type, " + "please submit a PR or a feature request to optimum-benchmark." 
) def __call__(self) -> Dataset: - task_dataset = self.task_generator() + task_dataset = self.generator() task_dataset = Dataset.from_dict(task_dataset) task_dataset.set_format(type="torch", columns=list(task_dataset.features.keys())) return task_dataset diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index 10432fa95..2f05dc62f 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -1,23 +1,36 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional -from .task_generator import TASKS_TO_GENERATORS, TaskGenerator +from .base import BaseGenerator +from .model_generator import MODEL_TYPE_TO_GENERATORS +from .task_generator import TASKS_TO_GENERATORS class InputGenerator: - task_generator: TaskGenerator + generator: BaseGenerator - def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None: - if task in TASKS_TO_GENERATORS: - all_shapes = {**model_shapes, **input_shapes} # input_shapes take precedence over model_shapes - self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=False) + def __init__( + self, + task: str, + input_shapes: Dict[str, int], + model_shapes: Dict[str, int], + model_type: Optional[str] = None, + ) -> None: + # input_shapes take precedence over model_shapes + all_shapes = {**model_shapes, **input_shapes} + + if model_type in MODEL_TYPE_TO_GENERATORS: + self.generator = MODEL_TYPE_TO_GENERATORS[model_type](shapes=all_shapes, with_labels=False) + elif task in TASKS_TO_GENERATORS: + self.generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=False) else: raise NotImplementedError( - f"Task {task} is not supported. " + f"Task {task} is not supported for input generation. " f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. " - "If you want to add support for this task, " - "please submit a PR or a feature request to optimum-benchmark. " + f"Available model types: {list(MODEL_TYPE_TO_GENERATORS.keys())}. " + "If you want to add support for this task or model type, " + "please submit a PR or a feature request to optimum-benchmark." 
)

     def __call__(self) -> Dict[str, Any]:
-        task_input = self.task_generator()
+        task_input = self.generator()
         return task_input
diff --git a/optimum_benchmark/generators/model_generator.py b/optimum_benchmark/generators/model_generator.py
new file mode 100644
index 000000000..711eeed14
--- /dev/null
+++ b/optimum_benchmark/generators/model_generator.py
@@ -0,0 +1,259 @@
+import logging
+
+import torch
+
+from .base import BaseGenerator
+
+LOGGER = logging.getLogger("generators")
+
+DEFAULT_VOCAB_SIZE = 2
+
+
+class IdeficsGenerator(BaseGenerator):
+    def input_ids(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images", "image_token_id"])
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE),
+            shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
+        )
+
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(self.shapes["batch_size"], self.shapes["num_images"]),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"] + self.shapes["num_images"],
+            ),
+        )
+
+    def pixel_values(self):
+        self.assert_not_missing_shapes(["batch_size", "num_images", "num_channels", "height", "width"])
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"],
+                self.shapes["num_channels"],
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
+        )
+
+    def image_attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"] + self.shapes["num_images"],
+                self.shapes["num_images"],
+            ),
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["attention_mask"] = self.attention_mask()
+        dummy["image_attention_mask"] = self.image_attention_mask()
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
+class Idefics2Generator(BaseGenerator):
+    def input_ids(self):
+        self.assert_not_missing_shapes(
+            ["batch_size", "sequence_length", "num_images", "image_seq_len", "image_token_id", "do_image_splitting"]
+        )
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE),
+            shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
+        )
+
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"]
+                * self.shapes["image_seq_len"]
+                * (5 if self.shapes["do_image_splitting"] else 1),  # splitting yields 4 crops + the original image
+            ),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images", "image_seq_len", "do_image_splitting"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"]
+                + self.shapes["num_images"]
+                * self.shapes["image_seq_len"]
+                * (5 if self.shapes["do_image_splitting"] else 1),
+            ),
+        )
+
+    def pixel_values(self):
+        self.assert_not_missing_shapes(
+            ["batch_size", "num_images", "num_channels", "height", "width", "do_image_splitting"]
+        )
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"] * (5 if self.shapes["do_image_splitting"] else 1),
+                self.shapes["num_channels"],
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
+        )
+
+    def pixel_attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "num_images", "height", "width", "do_image_splitting"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"] * (5 if self.shapes["do_image_splitting"] else 1),
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["attention_mask"] = self.attention_mask()
+        dummy["pixel_attention_mask"] = self.pixel_attention_mask()
+
+        LOGGER.debug("input_ids: %s", dummy["input_ids"].shape)
+        LOGGER.debug("pixel_values: %s", dummy["pixel_values"].shape)
+        LOGGER.debug("attention_mask: %s", dummy["attention_mask"].shape)
+        LOGGER.debug("pixel_attention_mask: %s", dummy["pixel_attention_mask"].shape)
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
+class Qwen2VLGenerator(BaseGenerator):
+    def input_ids(self):
+        self.assert_not_missing_shapes(
+            [
+                "batch_size",
+                "sequence_length",
+                "num_images",
+                "num_channels",
+                "height",
+                "width",
+                "patch_size",
+                "temporal_patch_size",
+                "spatial_merge_size",
+                "image_token_id",
+            ]
+        )
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE),
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"],
+            ),
+        )
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(
+                self.shapes["batch_size"],
+                int(
+                    self.shapes["num_images"]
+                    * self.shapes["height"]
+                    * self.shapes["width"]
+                    / self.shapes["temporal_patch_size"]
+                    / self.shapes["spatial_merge_size"]
+                    / self.shapes["patch_size"] ** 2
+                ),
+            ),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def pixel_values(self):
+        self.assert_not_missing_shapes(
+            ["num_images", "num_channels", "height", "width", "patch_size", "temporal_patch_size"]
+        )
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["num_images"]
+                * int(self.shapes["height"] / self.shapes["patch_size"])
+                * int(self.shapes["width"] / self.shapes["patch_size"]),
+                self.shapes["num_channels"]
+                * self.shapes["patch_size"]
+                * self.shapes["patch_size"]
+                * self.shapes["temporal_patch_size"],
+            ),
+        )
+
+    def image_grid_thw(self):
+        self.assert_not_missing_shapes(["num_images", "height", "width", "patch_size"])
+
+        return torch.tensor(
+            [
+                [
+                    self.shapes["num_images"],
+                    int(self.shapes["height"] / self.shapes["patch_size"]),
+                    int(self.shapes["width"] / self.shapes["patch_size"]),
+                ]
+            ]
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["image_grid_thw"] = self.image_grid_thw()
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
+MODEL_TYPE_TO_GENERATORS = {
+    "idefics": IdeficsGenerator,
+    "idefics2": Idefics2Generator,
+    "qwen2-vl": Qwen2VLGenerator,
+}
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index
ccf89cdc1..c0f37d14e 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -1,11 +1,6 @@ import logging -import random -import string -from abc import ABC -from typing import Dict, List, Tuple -# TODO: drop torch dependency and use numpy instead -import torch +from .base import BaseGenerator LOGGER = logging.getLogger("generators") @@ -14,56 +9,13 @@ DEFAULT_TYPE_VOCAB_SIZE = 2 -class TaskGenerator(ABC): - def __init__(self, shapes: Dict[str, int], with_labels: bool): - self.shapes = shapes - self.with_labels = with_labels - - def assert_not_missing_shapes(self, required_shapes: List[str]): - for shape in required_shapes: - assert self.shapes.get(shape, None) is not None, ( - f"{shape} either couldn't be inferred automatically from model artifacts or should be provided by the user. " - f"Please provide it under `scenario.input_shapes.{shape}` or open an issue/PR in optimum-benchmark repository. " - ) - - @staticmethod - def generate_constant_integers(value: int, shape: Tuple[int]): - return torch.full(shape, value, dtype=torch.int64) - - @staticmethod - def generate_constant_floats(value: float, shape: Tuple[int]): - return torch.full(shape, value, dtype=torch.float32) - - @staticmethod - def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]): - return torch.randint(min_value, max_value, shape) - - @staticmethod - def generate_random_floats(min_value: float, max_value: float, shape: Tuple[int]): - return torch.rand(shape) * (max_value - min_value) + min_value - - @staticmethod - def generate_ranges(start: int, stop: int, shape: Tuple[int]): - return torch.arange(start, stop).repeat(shape[0], 1) - - @staticmethod - def generate_random_strings(num_seq: int) -> List[str]: - return [ - "".join(random.choice(string.ascii_letters + string.digits) for _ in range(random.randint(10, 100))) - for _ in range(num_seq) - ] - - def __call__(self): - raise NotImplementedError("Generator must implement __call__ method") - - -class TextGenerator(TaskGenerator): +class TextGenerator(BaseGenerator): def input_ids(self): self.assert_not_missing_shapes(["batch_size", "sequence_length"]) return self.generate_random_integers( min_value=0, - max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE, + max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE), shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) @@ -80,7 +32,7 @@ def token_type_ids(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes.get("type_vocab_size", None) or DEFAULT_TYPE_VOCAB_SIZE, + max_value=self.shapes.get("type_vocab_size", DEFAULT_TYPE_VOCAB_SIZE), shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) @@ -102,7 +54,7 @@ def requires_position_ids(self): ) -class ImageGenerator(TaskGenerator): +class ImageGenerator(BaseGenerator): def pixel_values(self): self.assert_not_missing_shapes(["batch_size", "num_channels", "height", "width"]) @@ -118,7 +70,7 @@ def pixel_values(self): ) -class AudioGenerator(TaskGenerator): +class AudioGenerator(BaseGenerator): def input_values(self): self.assert_not_missing_shapes(["batch_size", "sequence_length"]) @@ -151,7 +103,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"],), ) @@ -179,7 +131,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - 
max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) @@ -319,7 +271,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"],), ) @@ -341,7 +293,7 @@ def labels(self): { "class_labels": self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["num_queries"],), ), "boxes": self.generate_random_floats(min_value=-1, max_value=1, shape=(self.shapes["num_queries"], 4)), @@ -365,7 +317,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"], self.shapes["height"], self.shapes["width"]), ) @@ -384,7 +336,7 @@ def labels(self): self.assert_not_missing_shapes(["batch_size"]) return self.generate_random_integers( - min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],) + min_value=0, max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -417,7 +369,7 @@ def __call__(self): return dummy -class PromptGenerator(TaskGenerator): +class PromptGenerator(BaseGenerator): def prompt(self): self.assert_not_missing_shapes(["batch_size"]) @@ -434,9 +386,7 @@ class FeatureExtractionGenerator(TextGenerator, ImageGenerator): def __call__(self): dummy = {} - if self.shapes.get("num_channels", None) is not None and self.shapes.get("height", None) is not None: - dummy["pixel_values"] = self.pixel_values() - else: + if self.shapes.get("sequence_length", None) is not None: dummy["input_ids"] = self.input_ids() dummy["attention_mask"] = self.attention_mask() @@ -446,92 +396,8 @@ def __call__(self): if self.requires_position_ids(): dummy["position_ids"] = self.position_ids() - return dummy - - -class ImageTextToTextGenerationGenerator(TaskGenerator): - def input_ids(self): - self.assert_not_missing_shapes( - [ - "batch_size", - "sequence_length", - "num_images", - "num_channels", - "height", - "width", - "patch_size", - "temporal_patch_size", - "spatial_merge_size", - "image_token_id", - ] - ) - - text_tokens = self.generate_random_integers( - min_value=0, - max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE, - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), - ) - image_tokens = self.generate_constant_integers( - value=self.shapes["image_token_id"], - shape=( - self.shapes["batch_size"], - int( - self.shapes["num_images"] - * self.shapes["height"] - * self.shapes["width"] - / self.shapes["temporal_patch_size"] - / self.shapes["spatial_merge_size"] - / self.shapes["patch_size"] ** 2 - ), - ), - ) - - return torch.cat((text_tokens, image_tokens), dim=1) - - def pixel_values(self): - self.assert_not_missing_shapes( - ["num_images", "num_channels", "height", "width", "patch_size", "temporal_patch_size"] - ) - - return self.generate_random_floats( - min_value=0, - max_value=1, - shape=( - self.shapes["num_images"] - * int(self.shapes["height"] / self.shapes["patch_size"]) - * int(self.shapes["width"] / self.shapes["patch_size"]), - self.shapes["num_channels"] - 
* self.shapes["patch_size"] - * self.shapes["patch_size"] - * self.shapes["temporal_patch_size"], - ), - ) - - def image_grid_thw(self): - self.assert_not_missing_shapes(["num_images", "height", "width", "patch_size"]) - - return torch.tensor( - [ - [ - self.shapes["num_images"], - int(self.shapes["height"] / self.shapes["patch_size"]), - int(self.shapes["width"] / self.shapes["patch_size"]), - ] - ] - ) - - def __call__(self): - dummy = {} - - dummy["input_ids"] = self.input_ids() - dummy["pixel_values"] = self.pixel_values() - dummy["image_grid_thw"] = self.image_grid_thw() - - if self.with_labels: - dummy["labels"] = self.input_ids() + if self.shapes.get("height", None) is not None: + dummy["pixel_values"] = self.pixel_values() return dummy @@ -549,7 +415,6 @@ def __call__(self): "image-classification": ImageClassificationGenerator, "object-detection": ObjectDetectionGenerator, "semantic-segmentation": SemanticSegmentationGenerator, - "image-text-to-text": ImageTextToTextGenerationGenerator, # diffusers pipelines tasks "text-to-image": PromptGenerator, "stable-diffusion": PromptGenerator, diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 8b3bb1b76..28182adb5 100644 --- a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -60,7 +60,10 @@ def __init__(self, config: InferenceConfig) -> None: def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.logger.info("\t+ Creating input generator") self.input_generator = InputGenerator( - task=backend.config.task, model_shapes=backend.model_shapes, input_shapes=self.config.input_shapes + task=backend.config.task, + input_shapes=self.config.input_shapes, + model_shapes=backend.model_shapes, + model_type=backend.config.model_type, ) if backend.config.task in TEXT_GENERATION_TASKS: From 44caa158f12f4302e6f0709099129893eb1a833f Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 20 Nov 2024 12:57:39 +0100 Subject: [PATCH 5/7] remove file --- test.py | 53 ----------------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 6c96b0581..000000000 --- a/test.py +++ /dev/null @@ -1,53 +0,0 @@ -from transformers import AutoProcessor, Idefics2Processor - -processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics-9b") -print(processor.to_dict()) - -# dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" -# dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg" - -# prompts = [ -# [ -# "User:", -# dogs_image_url_1, -# "Describe this image.\nAssistant: An image of two dogs.\n", -# "User:", -# dogs_image_url_2, -# "Describe this image.\nAssistant:", -# ] -# ] - -# inputs = processor(prompts, return_tensors="pt") - -# print("inputs_ids", inputs["input_ids"].shape) -# print("pixel_values", inputs["pixel_values"].shape) - -# batch_size = 1 -# sequence_length = 128 - -# num_images = 1 -# num_channels = 3 -# height = 224 -# width = 224 - -# patch_size = 14 -# temporal_patch_size = 2 - -# input_ids = torch.rand( -# size=( -# batch_size, -# sequence_length, -# ) -# ) - -# pixel_values = torch.rand( -# size=( -# num_images * int(height / patch_size) * int(width / patch_size), -# num_channels * patch_size * patch_size * temporal_patch_size, -# ) -# ) -# image_grid_thw = torch.tensor([[num_images, 
int(height / patch_size), int(width / patch_size)]]) - - -# print("image_grid_thw", image_grid_thw) -# print("pixel_values", pixel_values.shape) From 2248f8e319c103af1351319c2fec5c98dfbb25f2 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 20 Nov 2024 13:36:45 +0100 Subject: [PATCH 6/7] support generic image-text-to-text as well (blip, blip2, ..) --- examples/pytorch_vlm.yaml | 2 +- optimum_benchmark/backends/transformers_utils.py | 2 +- optimum_benchmark/generators/model_generator.py | 2 +- optimum_benchmark/generators/task_generator.py | 15 +++++++++++++++ optimum_benchmark/scenarios/inference/scenario.py | 2 -- tests/configs/_image_text_to_text_.yaml | 7 +++++++ .../cpu_inference_pytorch_image_text_to_text.yaml | 11 +++++++++++ 7 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 tests/configs/_image_text_to_text_.yaml create mode 100644 tests/configs/cpu_inference_pytorch_image_text_to_text.yaml diff --git a/examples/pytorch_vlm.yaml b/examples/pytorch_vlm.yaml index c4bb786fe..a39f8c8aa 100644 --- a/examples/pytorch_vlm.yaml +++ b/examples/pytorch_vlm.yaml @@ -17,7 +17,7 @@ backend: device_ids: 0 no_weights: true torch_dtype: float16 - model: HuggingFaceM4/idefics2-8b + model: Qwen/Qwen2-VL-7B-Instruct scenario: memory: true diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 5c1b18dda..7e39c9294 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -161,7 +161,7 @@ def extract_transformers_shapes_from_artifacts( # image input if "num_channels" in flat_artifacts_dict: - shapes["num_channels"] = flat_artifacts_dict.get("channels", None) + shapes["num_channels"] = flat_artifacts_dict["num_channels"] if "image_size" in flat_artifacts_dict: image_size = flat_artifacts_dict["image_size"] diff --git a/optimum_benchmark/generators/model_generator.py b/optimum_benchmark/generators/model_generator.py index 711eeed14..e709398a7 100644 --- a/optimum_benchmark/generators/model_generator.py +++ b/optimum_benchmark/generators/model_generator.py @@ -255,5 +255,5 @@ def __call__(self): MODEL_TYPE_TO_GENERATORS = { "idefics": IdeficsGenerator, "idefics2": Idefics2Generator, - "qwen2-vl": Qwen2VLGenerator, + "qwen2_vl": Qwen2VLGenerator, } diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index c0f37d14e..9f6834d38 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -402,6 +402,20 @@ def __call__(self): return dummy +class ImageTextToTextGenerator(TextGenerator, ImageGenerator): + def __call__(self): + dummy = {} + + dummy["input_ids"] = self.input_ids() + dummy["attention_mask"] = self.attention_mask() + dummy["pixel_values"] = self.pixel_values() + + if self.with_labels: + dummy["labels"] = self.input_ids() + + return dummy + + TASKS_TO_GENERATORS = { # transformers models tasks "feature-extraction": FeatureExtractionGenerator, @@ -415,6 +429,7 @@ def __call__(self): "image-classification": ImageClassificationGenerator, "object-detection": ObjectDetectionGenerator, "semantic-segmentation": SemanticSegmentationGenerator, + "image-text-to-text": ImageTextToTextGenerator, # diffusers pipelines tasks "text-to-image": PromptGenerator, "stable-diffusion": PromptGenerator, diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 28182adb5..512f269df 100644 --- 
a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -21,8 +21,6 @@ "min_new_tokens": 100, "do_sample": False, "use_cache": True, - "pad_token_id": 0, - "eos_token_id": 0, "num_beams": 1, } TEXT_GENERATION_PREFILL_OVERRIDES = { diff --git a/tests/configs/_image_text_to_text_.yaml b/tests/configs/_image_text_to_text_.yaml new file mode 100644 index 000000000..aa8357f75 --- /dev/null +++ b/tests/configs/_image_text_to_text_.yaml @@ -0,0 +1,7 @@ +hydra: + mode: MULTIRUN + sweeper: + params: + backend.task: image-text-to-text + backend.model: hf-internal-testing/tiny-random-BlipForConditionalGeneration,hf-internal-testing/tiny-random-Blip2ForConditionalGeneration,hf-internal-testing/tiny-random-IdeficsForVisionText2Text + +scenario.input_shapes.num_images: 2 diff --git a/tests/configs/cpu_inference_pytorch_image_text_to_text.yaml b/tests/configs/cpu_inference_pytorch_image_text_to_text.yaml new file mode 100644 index 000000000..df125a3ac --- /dev/null +++ b/tests/configs/cpu_inference_pytorch_image_text_to_text.yaml @@ -0,0 +1,11 @@ +defaults: + # order of inheritance, last one overrides previous ones + - _base_ # inherits from base config + - _cpu_ # inherits from cpu config + - _inference_ # inherits from inference config + - _image_text_to_text_ # inherits from image text to text config + - _no_weights_ # inherits from no weights config + - _self_ # hydra 1.1 compatibility + - override backend: pytorch + +name: cpu_inference_pytorch_image_text_to_text From 62746cc71e349121e6cbdbb90ed4a20223be0253 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 21 Nov 2024 14:22:57 +0100 Subject: [PATCH 7/7] num_choices in tests --- .../backends/transformers_utils.py | 7 ++- .../generators/task_generator.py | 45 ++++++++++++------- tests/configs/_image_text_to_text_.yaml | 5 ++- tests/test_api.py | 17 ++++++- 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 7e39c9294..2212cd5ff 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -62,8 +62,11 @@ model_loaders = (model_loaders,) for model_loader_name in model_loaders: - model_loader_class = getattr(transformers, model_loader_name) - TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name].update(model_loader_class._model_mapping._model_mapping) + model_loader_class = getattr(transformers, model_loader_name, None) + if model_loader_class is not None: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name].update( + model_loader_class._model_mapping._model_mapping + ) else: TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 9f6834d38..f11d21eb0 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -230,6 +230,32 @@ def __call__(self): class MultipleChoiceGenerator(TextGenerator): + def input_ids(self): + self.assert_not_missing_shapes(["batch_size", "num_choices", "sequence_length"]) + + return self.generate_random_integers( + min_value=0, + max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE), + shape=(self.shapes["batch_size"], self.shapes["num_choices"], self.shapes["sequence_length"]), + ) + + def attention_mask(self): + self.assert_not_missing_shapes(["batch_size", "num_choices", "sequence_length"]) + + return self.generate_constant_integers( + 
value=1, # no sparsity + shape=(self.shapes["batch_size"], self.shapes["num_choices"], self.shapes["sequence_length"]), + ) + + def token_type_ids(self): + self.assert_not_missing_shapes(["batch_size", "num_choices", "sequence_length"]) + + return self.generate_random_integers( + min_value=0, + max_value=self.shapes.get("type_vocab_size", DEFAULT_TYPE_VOCAB_SIZE), + shape=(self.shapes["batch_size"], self.shapes["num_choices"], self.shapes["sequence_length"]), + ) + def labels(self): self.assert_not_missing_shapes(["batch_size", "num_choices"]) @@ -240,24 +266,11 @@ def labels(self): def __call__(self): dummy = {} - dummy["input_ids"] = ( - self.input_ids() - .reshape(self.shapes["batch_size"], 1, self.shapes["sequence_length"]) - .repeat(1, self.shapes["num_choices"], 1) - ) - - dummy["attention_mask"] = ( - self.attention_mask() - .reshape(self.shapes["batch_size"], 1, self.shapes["sequence_length"]) - .repeat(1, self.shapes["num_choices"], 1) - ) + dummy["input_ids"] = self.input_ids() + dummy["attention_mask"] = self.attention_mask() if self.requires_token_type_ids(): - dummy["token_type_ids"] = ( - self.token_type_ids() - .reshape(self.shapes["batch_size"], 1, self.shapes["sequence_length"]) - .repeat(1, self.shapes["num_choices"], 1) - ) + dummy["token_type_ids"] = self.token_type_ids() if self.with_labels: dummy["label"] = self.labels() diff --git a/tests/configs/_image_text_to_text_.yaml b/tests/configs/_image_text_to_text_.yaml index aa8357f75..20043a674 100644 --- a/tests/configs/_image_text_to_text_.yaml +++ b/tests/configs/_image_text_to_text_.yaml @@ -3,5 +3,8 @@ hydra: sweeper: params: backend.task: image-text-to-text - backend.model: hf-internal-testing/tiny-random-BlipForConditionalGeneration,hf-internal-testing/tiny-random-Blip2ForConditionalGeneration,hf-internal-testing/tiny-random-IdeficsForVisionText2Text + backend.model: hf-internal-testing/tiny-random-GitForCausalLM, + hf-internal-testing/tiny-random-BlipForConditionalGeneration, + hf-internal-testing/tiny-random-Blip2ForConditionalGeneration, + hf-internal-testing/tiny-random-IdeficsForVisionText2Text +scenario.input_shapes.num_images: 2 diff --git a/tests/test_api.py b/tests/test_api.py index 66ee16f95..fd6e2dac1 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -47,6 +47,9 @@ def test_api_launch(device, scenario, library, task, model): benchmark_name = f"{device}_{scenario}_{library}_{task}_{model}" + if task == "multiple-choice": + INPUT_SHAPES["num_choices"] = 2 + if device == "cuda": device_isolation = True if is_rocm_system(): @@ -82,7 +85,7 @@ def test_api_launch(device, scenario, library, task, model): duration=1, iterations=1, warmup_runs=1, - input_shapes={"batch_size": 1, "sequence_length": 2}, + input_shapes=INPUT_SHAPES, generate_kwargs={"max_new_tokens": 2, "min_new_tokens": 2}, call_kwargs={"num_inference_steps": 2}, ) @@ -170,7 +173,14 @@ def test_api_input_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") - input_generator = InputGenerator(task=task, input_shapes=INPUT_SHAPES, model_shapes=model_shapes) + if task == "multiple-choice": + INPUT_SHAPES["num_choices"] = 2 + + input_generator = InputGenerator( + task=task, + input_shapes=INPUT_SHAPES, + model_shapes=model_shapes, + ) generated_inputs = input_generator() assert len(generated_inputs) > 0, "No inputs were generated" @@ -193,6 +203,9 @@ def test_api_dataset_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") + if task == "multiple-choice": + 
DATASET_SHAPES["num_choices"] = 2 + generator = DatasetGenerator(task=task, dataset_shapes=DATASET_SHAPES, model_shapes=model_shapes) generated_dataset = generator()
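
As a quick smoke test of the model_type dispatch introduced in this series, the following is a minimal sketch of driving InputGenerator directly. The concrete shape values below are illustrative placeholders, not values extracted from any real checkpoint or processor:

    from optimum_benchmark.generators.input_generator import InputGenerator

    input_generator = InputGenerator(
        task="image-text-to-text",
        input_shapes={"batch_size": 1, "sequence_length": 256},
        model_shapes={
            # these would normally be extracted from the model config/processor artifacts
            "num_images": 1,
            "num_channels": 3,
            "height": 224,
            "width": 224,
            "patch_size": 14,
            "temporal_patch_size": 2,
            "spatial_merge_size": 2,
            "image_token_id": 151655,  # placeholder image token id
            "vocab_size": 152064,  # placeholder vocab size
        },
        model_type="qwen2_vl",  # dispatches to Qwen2VLGenerator instead of the task-level generator
    )

    dummy_inputs = input_generator()  # dict with input_ids, pixel_values and image_grid_thw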