From f2b288a9f6526d8d5f2488fd987fdac73edee9f0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 19 Nov 2024 16:50:53 +0100
Subject: [PATCH 1/7] initial support

---
 examples/pytorch_vlm.yaml                |  42 +++
 optimum_benchmark/backends/timm_utils.py |  33 +-
 .../backends/transformers_utils.py       |  66 ++--
 .../generators/task_generator.py         | 354 +++++++++++++++++-
 optimum_benchmark/task_utils.py          |   1 +
 5 files changed, 441 insertions(+), 55 deletions(-)
 create mode 100644 examples/pytorch_vlm.yaml

diff --git a/examples/pytorch_vlm.yaml b/examples/pytorch_vlm.yaml
new file mode 100644
index 000000000..f11c4fcb3
--- /dev/null
+++ b/examples/pytorch_vlm.yaml
@@ -0,0 +1,42 @@
+defaults:
+  - benchmark
+  - scenario: inference
+  - launcher: process
+  - backend: pytorch
+  - _base_
+  - _self_
+
+name: pytorch_vlm
+
+launcher:
+  device_isolation: true
+  device_isolation_action: warn
+
+backend:
+  device: cuda
+  device_ids: 0
+  no_weights: true
+  torch_dtype: float16
+  model: Qwen/Qwen2-VL-7B-Instruct
+
+scenario:
+  memory: true
+  latency: true
+
+  warmup_runs: 10
+  iterations: 10
+  duration: 10
+
+  input_shapes:
+    # text
+    batch_size: 1
+    sequence_length: 256
+    # image
+    num_images: 1
+    num_channels: 3
+    height: 224
+    width: 224
+
+  generate_kwargs:
+    max_new_tokens: 32
+    min_new_tokens: 32
diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py
index 941e09917..d7b11b1dc 100644
--- a/optimum_benchmark/backends/timm_utils.py
+++ b/optimum_benchmark/backends/timm_utils.py
@@ -35,15 +35,17 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
     shapes = {}
 
     # image input
-    shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    if shapes["num_channels"] is None:
-        # processors have different names for the number of channels
+    if "num_channels" in artifacts_dict:
+        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
+    elif "channels" in artifacts_dict:
         shapes["num_channels"] = artifacts_dict.get("channels", None)
 
-    image_size = artifacts_dict.get("image_size", None)
-    if image_size is None:
-        # processors have different names for the image size
-        image_size = artifacts_dict.get("size", None)
+    if "image_size" in artifacts_dict:
+        image_size = artifacts_dict["image_size"]
+    elif "size" in artifacts_dict:
+        image_size = artifacts_dict["size"]
+    else:
+        image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -57,24 +59,19 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
-    else:
-        shapes["height"] = None
-        shapes["width"] = None
 
-    input_size = artifacts_dict.get("input_size", None)
-    if input_size is not None:
+    if "input_size" in artifacts_dict:
+        input_size = artifacts_dict.get("input_size", None)
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    id2label = artifacts_dict.get("id2label", None)
-    if id2label is not None:
+    if "id2label" in artifacts_dict:
+        id2label = artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-
-    num_classes = artifacts_dict.get("num_classes", None)
-    if num_classes is not None:
-        shapes["num_labels"] = num_classes
+    elif "num_classes" in artifacts_dict:
+        shapes["num_labels"] = artifacts_dict["num_classes"]
 
     return shapes
 
diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py
index 3c7ecdcd6..009c53688 100644
--- a/optimum_benchmark/backends/transformers_utils.py
+++ b/optimum_benchmark/backends/transformers_utils.py
@@ -47,6 +47,7 @@
     "image-to-text": "AutoModelForVision2Seq",
     "text-generation": "AutoModelForCausalLM",
     "text2text-generation": "AutoModelForSeq2SeqLM",
+    "image-text-to-text": "AutoModelForImageTextToText",
     "visual-question-answering": "AutoModelForVisualQuestionAnswering",
     "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"),
 }
@@ -125,22 +126,29 @@ def extract_transformers_shapes_from_artifacts(
     shapes = {}
 
     # text input
-    shapes["vocab_size"] = artifacts_dict.get("vocab_size", None)
-    shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", None)
-    shapes["max_position_embeddings"] = artifacts_dict.get("max_position_embeddings", None)
-    if shapes["max_position_embeddings"] is None:
-        shapes["max_position_embeddings"] = artifacts_dict.get("n_positions", None)
+    if "vocab_size" in artifacts_dict:
+        shapes["vocab_size"] = artifacts_dict["vocab_size"]
+
+    if "type_vocab_size" in artifacts_dict:
+        shapes["type_vocab_size"] = artifacts_dict["type_vocab_size"]
+
+    if "max_position_embeddings" in artifacts_dict:
+        shapes["max_position_embeddings"] = artifacts_dict["max_position_embeddings"]
+    elif "n_positions" in artifacts_dict:
+        shapes["max_position_embeddings"] = artifacts_dict["n_positions"]
 
     # image input
-    shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    if shapes["num_channels"] is None:
-        # processors have different names for the number of channels
+    if "num_channels" in artifacts_dict:
+        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
+    elif "channels" in artifacts_dict:
         shapes["num_channels"] = artifacts_dict.get("channels", None)
 
-    image_size = artifacts_dict.get("image_size", None)
-    if image_size is None:
-        # processors have different names for the image size
-        image_size = artifacts_dict.get("size", None)
+    if "image_size" in artifacts_dict:
+        image_size = artifacts_dict["image_size"]
+    elif "size" in artifacts_dict:
+        image_size = artifacts_dict["size"]
+    else:
+        image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -154,29 +162,37 @@ def extract_transformers_shapes_from_artifacts(
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
-    else:
-        shapes["height"] = None
-        shapes["width"] = None
 
-    input_size = artifacts_dict.get("input_size", None)
-    if input_size is not None:
+    if "input_size" in artifacts_dict:
+        input_size = artifacts_dict["input_size"]
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    id2label = artifacts_dict.get("id2label", None)
-    if id2label is not None:
+    if "id2label" in artifacts_dict:
+        id2label = artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-
-    num_classes = artifacts_dict.get("num_classes", None)
-    if num_classes is not None:
-        shapes["num_labels"] = num_classes
+    elif "num_classes" in artifacts_dict:
+        shapes["num_labels"] = artifacts_dict["num_classes"]
 
     # object detection labels
-    shapes["num_queries"] = artifacts_dict.get("num_queries", None)
-    if shapes["num_queries"] == 0:
-        shapes["num_queries"] = 2
+    if "num_queries" in artifacts_dict:
+        shapes["num_queries"] = artifacts_dict["num_queries"]
+
+    # image-text input
+    if "image_token_id" in artifacts_dict:
+        shapes["image_token_id"] = artifacts_dict["image_token_id"]
+
+    if "vision_config" in artifacts_dict:
+        if "in_chans" in artifacts_dict["vision_config"]:
+            shapes["num_channels"] = artifacts_dict["vision_config"]["in_chans"]
+        if "patch_size" in artifacts_dict["vision_config"]:
+            shapes["patch_size"] = artifacts_dict["vision_config"]["patch_size"]
+        if "temporal_patch_size" in artifacts_dict["vision_config"]:
+            shapes["temporal_patch_size"] = artifacts_dict["vision_config"]["temporal_patch_size"]
+        if "spatial_merge_size" in artifacts_dict["vision_config"]:
+            shapes["spatial_merge_size"] = artifacts_dict["vision_config"]["spatial_merge_size"]
 
     return shapes
 
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index 761315780..411130355 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -2,7 +2,7 @@
 import random
 import string
 from abc import ABC
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 # TODO: drop torch dependency and use numpy instead
 import torch
@@ -15,10 +15,18 @@
 
 
 class TaskGenerator(ABC):
-    def __init__(self, shapes, with_labels: bool):
+    def __init__(self, shapes: Dict[str, int], with_labels: bool):
         self.shapes = shapes
         self.with_labels = with_labels
 
+    @staticmethod
+    def generate_constant_integers(value: int, shape: Tuple[int]):
+        return torch.full(shape, value, dtype=torch.int64)
+
+    @staticmethod
+    def generate_constant_floats(value: float, shape: Tuple[int]):
+        return torch.full(shape, value, dtype=torch.float32)
+
     @staticmethod
     def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]):
         return torch.randint(min_value, max_value, shape)
@@ -44,27 +52,62 @@ def __call__(self):
 
 class TextGenerator(TaskGenerator):
     def input_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
+        )
+
         return self.generate_random_integers(
             min_value=0,
-            max_value=self.shapes["vocab_size"] or DEFAULT_VOCAB_SIZE,
+            max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE,
             shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
         )
 
     def attention_mask(self):
-        return self.generate_random_integers(
-            min_value=1,  # avoid sparse attention
-            max_value=2,
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate attention masks."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate attention masks."
+        )
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
             shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
         )
 
     def token_type_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate token type ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate token type ids."
+        )
+
         return self.generate_random_integers(
             min_value=0,
-            max_value=self.shapes["type_vocab_size"] or DEFAULT_TYPE_VOCAB_SIZE,
+            max_value=self.shapes.get("type_vocab_size", None) or DEFAULT_TYPE_VOCAB_SIZE,
             shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
         )
 
     def position_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate position ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate position ids."
+        )
+
         return self.generate_ranges(
             start=0,
             stop=self.shapes["sequence_length"],
@@ -72,14 +115,33 @@ def position_ids(self):
         )
 
     def requires_token_type_ids(self):
-        return self.shapes["type_vocab_size"] is not None and self.shapes["type_vocab_size"] > 1
+        return self.shapes.get("type_vocab_size", None) is not None and self.shapes["type_vocab_size"] > 1
 
     def requires_position_ids(self):
-        return self.shapes["max_position_embeddings"] is not None
+        return (
+            self.shapes.get("max_position_embeddings", None) is not None and self.shapes["max_position_embeddings"] > 1
+        )
 
 
 class ImageGenerator(TaskGenerator):
     def pixel_values(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate pixel values."
+        )
+        assert self.shapes.get("num_channels", None) is not None, (
+            "Number of channels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate pixel values."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate pixel values."
+        )
+
         return self.generate_random_floats(
             min_value=0,
             max_value=1,
@@ -89,11 +151,32 @@ def pixel_values(self):
 
 class AudioGenerator(TaskGenerator):
     def input_values(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input values."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate input values."
+        )
         return self.generate_random_floats(
             min_value=-1, max_value=1, shape=(self.shapes["batch_size"], self.shapes["sequence_length"])
         )
 
     def input_features(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input features."
+        )
+        assert self.shapes.get("feature_size", None) is not None, (
+            "Feature size couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `feature_size` to be able to generate input features."
+        )
+        assert self.shapes.get("nb_max_frames", None) is not None, (
+            "Number of max frames couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `nb_max_frames` to be able to generate input features."
+        )
+
         return self.generate_random_floats(
             min_value=-1,
             max_value=1,
@@ -103,8 +186,15 @@ def input_features(self):
 
 class TextClassificationGenerator(TextGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
+            shape=(self.shapes["batch_size"],),
         )
 
     def __call__(self):
@@ -127,6 +217,15 @@ def __call__(self):
 
 class TokenClassificationGenerator(TextGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0,
             max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
@@ -177,11 +276,29 @@ def __call__(self):
 
 class QuestionAnsweringGenerator(TextGenerator):
     def start_positions(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate start positions."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate start positions."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
         )
 
     def end_positions(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate end positions."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate end positions."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
         )
@@ -221,6 +338,15 @@ def __call__(self):
 
 class MultipleChoiceGenerator(TextGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("num_choices", None) is not None, (
+            "Number of choices must be provided, "
+            "please provide it in `input_shapes` as `num_choices` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_choices"], shape=(self.shapes["batch_size"],)
         )
@@ -255,8 +381,15 @@ def __call__(self):
 
 class ImageClassificationGenerator(ImageGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
+            shape=(self.shapes["batch_size"],),
         )
 
     def __call__(self):
@@ -271,6 +404,15 @@ def __call__(self):
 
 class ObjectDetectionGenerator(ImageGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("num_queries", None) is not None, (
+            "Number of queries must be provided, "
+            "please provide it in `input_shapes` as `num_queries` to be able to generate labels."
+        )
+
         return [
             {
                 "class_labels": self.generate_random_integers(
@@ -295,6 +437,19 @@ def __call__(self):
 
 class SemanticSegmentationGenerator(ImageGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate labels."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0,
             max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS,
@@ -313,6 +468,15 @@ def __call__(self):
 
 class AudioClassificationGenerator(AudioGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("num_labels", None) is not None, (
+            "Number of labels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
         )
@@ -329,6 +493,19 @@ def __call__(self):
 
 class AutomaticSpeechRecognitionGenerator(AudioGenerator):
     def labels(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
+        )
+        assert self.shapes.get("num_labels", None) is not None, (
+            "Number of labels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
+        )
+
         return self.generate_random_integers(
             min_value=0,
             max_value=self.shapes["vocab_size"] or DEFAULT_TYPE_VOCAB_SIZE,
@@ -347,6 +524,11 @@ def __call__(self):
 
 class PromptGenerator(TaskGenerator):
     def prompt(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate prompts."
+        )
+
         return self.generate_random_strings(num_seq=self.shapes["batch_size"])
 
     def __call__(self):
@@ -375,6 +557,155 @@ def __call__(self):
         return dummy
 
 
+class ImageTextToTextGenerationGenerator(TextGenerator, ImageGenerator):
+    def input_ids(self):
+        assert self.shapes.get("batch_size", None) is not None, (
+            "Batch size must be provided, "
+            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("sequence_length", None) is not None, (
+            "Sequence length must be provided, "
+            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
+        )
+        assert self.shapes.get("num_images", None) is not None, (
+            "Number of images must be provided, "
+            "please provide it in `input_shapes` as `num_images` to be able to generate input ids."
+        )
+        assert self.shapes.get("num_channels", None) is not None, (
+            "Number of channels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_channels` to be able to generate input ids."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate input ids."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate input ids."
+        )
+        assert self.shapes.get("patch_size", None) is not None, (
+            "Patch size must be provided, "
+            "please provide it in `input_shapes` as `patch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("temporal_patch_size", None) is not None, (
+            "Temporal patch size must be provided, "
+            "please provide it in `input_shapes` as `temporal_patch_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("spatial_merge_size", None) is not None, (
+            "Spatial merge size must be provided, "
+            "please provide it in `input_shapes` as `spatial_merge_size` to be able to generate input ids."
+        )
+        assert self.shapes.get("image_token_id", None) is not None, (
+            "Image token id must be provided, "
+            "please provide it in `input_shapes` as `image_token_id` to be able to generate input ids."
+        )
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"],
+            ),
+        )
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(
+                self.shapes["batch_size"],
+                int(
+                    self.shapes["num_images"]
+                    * self.shapes["height"]
+                    * self.shapes["width"]
+                    / self.shapes["temporal_patch_size"]
+                    / self.shapes["spatial_merge_size"]
+                    / self.shapes["patch_size"] ** 2
+                ),
+            ),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def pixel_values(self):
+        assert self.shapes.get("num_images", None) is not None, (
+            "Number of images must be provided, "
+            "please provide it in `input_shapes` as `num_images` to be able to generate pixel values."
+        )
+        assert self.shapes.get("num_channels", None) is not None, (
+            "Number of channels couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate pixel values."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate pixel values."
+        )
+        assert self.shapes.get("patch_size", None) is not None, (
+            "Patch size must be provided, "
+            "please provide it in `input_shapes` as `patch_size` to be able to generate pixel values."
+        )
+        assert self.shapes.get("temporal_patch_size", None) is not None, (
+            "Temporal patch size must be provided, "
+            "please provide it in `input_shapes` as `temporal_patch_size` to be able to generate pixel values."
+        )
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["num_images"]
+                * int(self.shapes["height"] / self.shapes["patch_size"])
+                * int(self.shapes["width"] / self.shapes["patch_size"]),
+                self.shapes["num_channels"]
+                * self.shapes["patch_size"]
+                * self.shapes["patch_size"]
+                * self.shapes["temporal_patch_size"],
+            ),
+        )
+
+    def image_grid_thw(self):
+        assert self.shapes.get("num_images", None) is not None, (
+            "Number of images must be provided, "
+            "please provide it in `input_shapes` as `num_images` to be able to generate image grid."
+        )
+        assert self.shapes.get("height", None) is not None, (
+            "Height couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `height` to be able to generate image grid."
+        )
+        assert self.shapes.get("width", None) is not None, (
+            "Width couldn't be inferred automatically from model, "
+            "please provide it in `input_shapes` as `width` to be able to generate image grid."
+        )
+
+        return torch.tensor(
+            [
+                [
+                    self.shapes["num_images"],
+                    int(self.shapes["height"] / self.shapes["patch_size"]),
+                    int(self.shapes["width"] / self.shapes["patch_size"]),
+                ]
+            ]
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["image_grid_thw"] = self.image_grid_thw()
+
+        print("input_ids", dummy["input_ids"].shape)
+        print("pixel_values", dummy["pixel_values"].shape)
+        print("image_grid_thw", dummy["image_grid_thw"].shape)
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
 TASKS_TO_GENERATORS = {
     # transformers models tasks
     "feature-extraction": FeatureExtractionGenerator,
@@ -388,6 +719,7 @@
     "image-classification": ImageClassificationGenerator,
     "object-detection": ObjectDetectionGenerator,
     "semantic-segmentation": SemanticSegmentationGenerator,
+    "image-text-to-text": ImageTextToTextGenerationGenerator,
     # diffusers pipelines tasks
     "text-to-image": PromptGenerator,
     "stable-diffusion": PromptGenerator,
diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py
index 337e835ec..0a2a98c2b 100644
--- a/optimum_benchmark/task_utils.py
+++ b/optimum_benchmark/task_utils.py
@@ -47,6 +47,7 @@
     "image-to-text",
     "conversational",
     "text-generation",
+    "image-text-to-text",
     "text2text-generation",
     "automatic-speech-recognition",
 ]

From f2a7a2c67c4269e425d202229b65716dac40fb8c Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 20 Nov 2024 07:38:12 +0100
Subject: [PATCH 2/7] clean up

---
 .../generators/task_generator.py    | 34 +++++++++++++------
 .../scenarios/inference/config.py   |  5 ++-
 .../scenarios/inference/scenario.py |  4 +--
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index 411130355..b99c918c9 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -145,7 +145,12 @@ def pixel_values(self):
         return self.generate_random_floats(
             min_value=0,
             max_value=1,
-            shape=(self.shapes["batch_size"], self.shapes["num_channels"], self.shapes["height"], self.shapes["width"]),
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_channels"],
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
         )
 
 
@@ -160,7 +165,12 @@ def input_values(self):
             "please provide it in `input_shapes` as `sequence_length` to be able to generate input values."
         )
         return self.generate_random_floats(
-            min_value=-1, max_value=1, shape=(self.shapes["batch_size"], self.shapes["sequence_length"])
+            min_value=-1,
+            max_value=1,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"],
+            ),
         )
 
     def input_features(self):
@@ -180,7 +190,11 @@ def input_features(self):
         return self.generate_random_floats(
             min_value=-1,
             max_value=1,
-            shape=(self.shapes["batch_size"], self.shapes["feature_size"], self.shapes["nb_max_frames"]),
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["feature_size"],
+                self.shapes["nb_max_frames"],
+            ),
         )
 
 
@@ -286,7 +300,9 @@ def start_positions(self):
         )
 
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["sequence_length"],
+            shape=(self.shapes["batch_size"],),
         )
 
     def end_positions(self):
@@ -300,7 +316,9 @@ def end_positions(self):
         )
 
         return self.generate_random_integers(
-            min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],)
+            min_value=0,
+            max_value=self.shapes["sequence_length"],
+            shape=(self.shapes["batch_size"],),
         )
 
     def __call__(self):
@@ -557,7 +575,7 @@ def __call__(self):
         return dummy
 
 
-class ImageTextToTextGenerationGenerator(TextGenerator, ImageGenerator):
+class ImageTextToTextGenerationGenerator(TaskGenerator):
     def input_ids(self):
@@ -696,10 +714,6 @@ def __call__(self):
         dummy["pixel_values"] = self.pixel_values()
         dummy["image_grid_thw"] = self.image_grid_thw()
 
-        print("input_ids", dummy["input_ids"].shape)
-        print("pixel_values", dummy["pixel_values"].shape)
-        print("image_grid_thw", dummy["image_grid_thw"].shape)
-
         if self.with_labels:
             dummy["labels"] = self.input_ids()
 
         return dummy
diff --git a/optimum_benchmark/scenarios/inference/config.py b/optimum_benchmark/scenarios/inference/config.py
index 2c05d97f8..57d482abf 100644
--- a/optimum_benchmark/scenarios/inference/config.py
+++ b/optimum_benchmark/scenarios/inference/config.py
@@ -7,7 +7,10 @@
 
 LOGGER = getLogger("inference")
 
-INPUT_SHAPES = {"batch_size": 2, "num_choices": 2, "sequence_length": 16}
+INPUT_SHAPES = {
+    "batch_size": 2,
+    "sequence_length": 16,
+}
 
 
 @dataclass
diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py
index f2f18e0b1..8b3bb1b76 100644
--- a/optimum_benchmark/scenarios/inference/scenario.py
+++ b/optimum_benchmark/scenarios/inference/scenario.py
@@ -414,8 +414,8 @@ def atomic_call_volume(self) -> int:  # in images
     @property
     def atomic_prefill_volume(self) -> int:  # in tokens
         if {"input_ids", "prompt", "prompts"} & set(self.inputs.keys()):
-            # text conditioned generation (1 bos token or sequence_length tokens)
-            return self.config.input_shapes["batch_size"] * max(self.config.input_shapes["sequence_length"], 1)
+            # text conditioned generation (sequence_length tokens)
+            return self.config.input_shapes["batch_size"] * self.config.input_shapes["sequence_length"]
         else:
             # image/audio conditioned generation (1 bos token)
             return self.config.input_shapes["batch_size"]

From 9a854ae7750937d7297d8777be4c65e7ff6994f0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 20 Nov 2024 08:41:02 +0100
Subject: [PATCH 3/7] simpler

---
 optimum_benchmark/backends/base.py       |   1 -
 optimum_benchmark/backends/timm_utils.py |   9 +-
 .../backends/transformers_utils.py       | 135 +++++----
 .../generators/dataset_generator.py      |   6 +-
 .../generators/input_generator.py        |   4 +-
 .../generators/task_generator.py         | 268 +++-----
 test.py                                  |  53 ++++
 7 files changed, 179 insertions(+), 297 deletions(-)
 create mode 100644 test.py

diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py
index 8a59ac6d6..44d71ef4b 100644
--- a/optimum_benchmark/backends/base.py
+++ b/optimum_benchmark/backends/base.py
@@ -36,7 +36,6 @@ class Backend(Generic[BackendConfigT], ABC):
     NAME: ClassVar[str]
 
-    model_type: str
     model_shapes: Dict[str, int]
 
     pretrained_model: PreTrainedModel
diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py
index d7b11b1dc..dbaf36fd5 100644
--- a/optimum_benchmark/backends/timm_utils.py
+++ b/optimum_benchmark/backends/timm_utils.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Dict
 
 from transformers import PretrainedConfig
@@ -66,12 +67,8 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
-    # classification labels
-    if "id2label" in artifacts_dict:
-        id2label = artifacts_dict["id2label"]
-        shapes["num_labels"] = len(id2label)
-    elif "num_classes" in artifacts_dict:
-        shapes["num_labels"] = artifacts_dict["num_classes"]
+    if not all(key in shapes for key in ("num_channels", "height", "width")):
+        warnings.warn("Could not extract shapes [num_channels, height, width] from timm model config.")
 
     return shapes
 
diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py
index 009c53688..8ecbde01f 100644
--- a/optimum_benchmark/backends/transformers_utils.py
+++ b/optimum_benchmark/backends/transformers_utils.py
@@ -1,4 +1,3 @@
-import warnings
 from contextlib import contextmanager
 from typing import Any, Dict, Optional, Union
 
@@ -7,6 +6,7 @@
 from transformers import (
     AutoConfig,
     AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoProcessor,
     AutoTokenizer,
     FeatureExtractionMixin,
@@ -67,7 +67,7 @@
 else:
     TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {}
 
-PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, SpecialTokensMixin, ProcessorMixin]
+PretrainedProcessor = Union["FeatureExtractionMixin", "ImageProcessingMixin", "SpecialTokensMixin", "ProcessorMixin"]
 
 
 def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig":
@@ -92,61 +92,80 @@ def get_transformers_pretrained_processor(model: str, **kwargs) -> Optional["PretrainedProcessor"]:
         return AutoFeatureExtractor.from_pretrained(model, **kwargs)
     except Exception:
         try:
-            return AutoTokenizer.from_pretrained(model, **kwargs)
+            return AutoImageProcessor.from_pretrained(model, **kwargs)
         except Exception:
-            return None
+            try:
+                return AutoTokenizer.from_pretrained(model, **kwargs)
+            except Exception:
+                return None
+
+
+def get_flat_dict(d: Dict[str, Any]) -> Dict[str, Any]:
+    flat_dict = {}
+    for k, v in d.items():
+        if isinstance(v, dict):
+            flat_dict.update(get_flat_dict(v))
+        else:
+            flat_dict[k] = v
+    return flat_dict
+
+
+def get_flat_artifact_dict(artifact: Union[PretrainedConfig, PretrainedProcessor]) -> Dict[str, Any]:
+    if isinstance(artifact, ProcessorMixin):
+        artifact_dict = {}
+
+        for attribute in artifact.attributes:
+            artifact_dict.update(get_flat_artifact_dict(attribute))
+    else:
+        if hasattr(artifact, "to_dict"):
+            artifact_dict = {k: v for k, v in artifact.to_dict().items() if v is not None}
+        else:
+            try:
+                artifact_dict = {k: getattr(artifact, k) for k in dir(artifact) if getattr(artifact, k) is not None}
+            except Exception:
+                pass
+
+    artifact_dict = get_flat_dict(artifact_dict)
+
+    return artifact_dict
 
 
 def extract_transformers_shapes_from_artifacts(
-    config: Optional["PretrainedConfig"] = None, processor: Optional["PretrainedProcessor"] = None
+    config: Optional["PretrainedConfig"] = None,
+    processor: Optional["PretrainedProcessor"] = None,
 ) -> Dict[str, Any]:
-    artifacts_dict = {}
+    flat_artifacts_dict = {}
 
-    if config is not None and hasattr(config, "to_dict"):
-        config_dict = {k: v for k, v in config.to_dict().items() if v is not None}
-        artifacts_dict.update(config_dict)
-    elif config is not None:
-        try:
-            config_dict = {k: getattr(config, k) for k in dir(config) if isinstance(getattr(config, k), int)}
-            artifacts_dict.update(config_dict)
-        except Exception:
-            warnings.warn(f"Could not extract shapes from config {config}")
+    if config is not None:
+        flat_artifacts_dict.update(get_flat_artifact_dict(config))
 
-    if processor is not None and hasattr(processor, "to_dict"):
-        processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None}
-        artifacts_dict.update(processor_dict)
-    elif processor is not None:
-        try:
-            processor_dict = {
-                k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)
-            }
-        except Exception:
-            warnings.warn(f"Could not extract shapes from processor {processor}")
+    if processor is not None:
+        flat_artifacts_dict.update(get_flat_artifact_dict(processor))
 
     shapes = {}
 
     # text input
-    if "vocab_size" in artifacts_dict:
-        shapes["vocab_size"] = artifacts_dict["vocab_size"]
+    if "vocab_size" in flat_artifacts_dict:
+        shapes["vocab_size"] = flat_artifacts_dict["vocab_size"]
 
-    if "type_vocab_size" in artifacts_dict:
-        shapes["type_vocab_size"] = artifacts_dict["type_vocab_size"]
+    if "type_vocab_size" in flat_artifacts_dict:
+        shapes["type_vocab_size"] = flat_artifacts_dict["type_vocab_size"]
 
-    if "max_position_embeddings" in artifacts_dict:
-        shapes["max_position_embeddings"] = artifacts_dict["max_position_embeddings"]
-    elif "n_positions" in artifacts_dict:
-        shapes["max_position_embeddings"] = artifacts_dict["n_positions"]
+    if "max_position_embeddings" in flat_artifacts_dict:
+        shapes["max_position_embeddings"] = flat_artifacts_dict["max_position_embeddings"]
+    elif "n_positions" in flat_artifacts_dict:
+        shapes["max_position_embeddings"] = flat_artifacts_dict["n_positions"]
 
     # image input
-    if "num_channels" in artifacts_dict:
-        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    elif "channels" in artifacts_dict:
-        shapes["num_channels"] = artifacts_dict.get("channels", None)
+    if "num_channels" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["num_channels"]
+    elif "channels" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["channels"]
 
-    if "image_size" in artifacts_dict:
-        image_size = artifacts_dict["image_size"]
-    elif "size" in artifacts_dict:
-        image_size = artifacts_dict["size"]
+    if "image_size" in flat_artifacts_dict:
+        image_size = flat_artifacts_dict["image_size"]
+    elif "size" in flat_artifacts_dict:
+        image_size = flat_artifacts_dict["size"]
     else:
         image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -163,34 +182,32 @@ def extract_transformers_shapes_from_artifacts(
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
 
-    if "input_size" in artifacts_dict:
-        input_size = artifacts_dict["input_size"]
+    if "input_size" in flat_artifacts_dict:
+        input_size = flat_artifacts_dict["input_size"]
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    if "id2label" in artifacts_dict:
-        id2label = artifacts_dict["id2label"]
+    if "id2label" in flat_artifacts_dict:
+        id2label = flat_artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-    elif "num_classes" in artifacts_dict:
-        shapes["num_labels"] = artifacts_dict["num_classes"]
+    elif "num_classes" in flat_artifacts_dict:
+        shapes["num_labels"] = flat_artifacts_dict["num_classes"]
 
     # object detection labels
-    if "num_queries" in artifacts_dict:
-        shapes["num_queries"] = artifacts_dict["num_queries"]
+    if "num_queries" in flat_artifacts_dict:
+        shapes["num_queries"] = flat_artifacts_dict["num_queries"]
 
     # image-text input
-    if "image_token_id" in artifacts_dict:
-        shapes["image_token_id"] = artifacts_dict["image_token_id"]
-
-    if "vision_config" in artifacts_dict:
-        if "in_chans" in artifacts_dict["vision_config"]:
-            shapes["num_channels"] = artifacts_dict["vision_config"]["in_chans"]
-        if "patch_size" in artifacts_dict["vision_config"]:
-            shapes["patch_size"] = artifacts_dict["vision_config"]["patch_size"]
-        if "temporal_patch_size" in artifacts_dict["vision_config"]:
-            shapes["temporal_patch_size"] = artifacts_dict["vision_config"]["temporal_patch_size"]
-        if "spatial_merge_size" in artifacts_dict["vision_config"]:
-            shapes["spatial_merge_size"] = artifacts_dict["vision_config"]["spatial_merge_size"]
+    if "image_token_id" in flat_artifacts_dict:
+        shapes["image_token_id"] = flat_artifacts_dict["image_token_id"]
+    if "in_chans" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["in_chans"]
+    if "patch_size" in flat_artifacts_dict:
+        shapes["patch_size"] = flat_artifacts_dict["patch_size"]
+    if "temporal_patch_size" in flat_artifacts_dict:
+        shapes["temporal_patch_size"] = flat_artifacts_dict["temporal_patch_size"]
+    if "spatial_merge_size" in flat_artifacts_dict:
+        shapes["spatial_merge_size"] = flat_artifacts_dict["spatial_merge_size"]
 
     return shapes
 
diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py
index bbaa87f0a..781e20d96 100644
--- a/optimum_benchmark/generators/dataset_generator.py
+++ b/optimum_benchmark/generators/dataset_generator.py
@@ -9,11 +9,11 @@ class DatasetGenerator:
     task_generator: TaskGenerator
 
     def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None:
-        dataset_shapes["batch_size"] = dataset_shapes["dataset_size"]
+        dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size", None)
 
         if task in TASKS_TO_GENERATORS:
-            shapes = {**dataset_shapes, **model_shapes}
-            self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=True)
+            all_shapes = {**model_shapes, **dataset_shapes}  # dataset_shapes take precedence over model_shapes
+            self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=True)
         else:
             raise NotImplementedError(
                 f"Task {task} is not supported. \n"
diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py
index 1dd5501a9..10432fa95 100644
--- a/optimum_benchmark/generators/input_generator.py
+++ b/optimum_benchmark/generators/input_generator.py
@@ -8,8 +8,8 @@ class InputGenerator:
     def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None:
         if task in TASKS_TO_GENERATORS:
-            shapes = {**input_shapes, **model_shapes}
-            self.task_generator = TASKS_TO_GENERATORS[task](shapes=shapes, with_labels=False)
+            all_shapes = {**model_shapes, **input_shapes}  # input_shapes take precedence over model_shapes
+            self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=False)
         else:
             raise NotImplementedError(
                 f"Task {task} is not supported. "
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index b99c918c9..ccf89cdc1 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -19,6 +19,13 @@ class TaskGenerator(ABC):
         self.shapes = shapes
         self.with_labels = with_labels
 
+    def assert_not_missing_shapes(self, required_shapes: List[str]):
+        for shape in required_shapes:
+            assert self.shapes.get(shape, None) is not None, (
+                f"{shape} either couldn't be inferred automatically from model artifacts or should be provided by the user. "
+                f"Please provide it under `scenario.input_shapes.{shape}` or open an issue/PR in optimum-benchmark repository. "
+            )
+
     @staticmethod
     def generate_constant_integers(value: int, shape: Tuple[int]):
         return torch.full(shape, value, dtype=torch.int64)
@@ -59,14 +66,7 @@ def __call__(self):
 
 class TextGenerator(TaskGenerator):
     def input_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -77,14 +77,7 @@ def input_ids(self):
         )
 
     def attention_mask(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate attention masks."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate attention masks."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_constant_integers(
             value=1,  # no sparsity
@@ -95,14 +88,7 @@ def attention_mask(self):
         )
 
     def token_type_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate token type ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate token type ids."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -113,14 +99,7 @@ def token_type_ids(self):
         )
 
     def position_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate position ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate position ids."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_ranges(
             start=0,
             stop=self.shapes["sequence_length"],
@@ -136,22 +115,8 @@ def requires_position_ids(self):
 
 class ImageGenerator(TaskGenerator):
     def pixel_values(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate pixel values."
-        )
-        assert self.shapes.get("num_channels", None) is not None, (
-            "Number of channels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
-        )
-        assert self.shapes.get("height", None) is not None, (
-            "Height couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `height` to be able to generate pixel values."
-        )
-        assert self.shapes.get("width", None) is not None, (
-            "Width couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `width` to be able to generate pixel values."
-        )
+        self.assert_not_missing_shapes(["batch_size", "num_channels", "height", "width"])
 
         return self.generate_random_floats(
             min_value=0,
@@ -167,14 +132,8 @@ def pixel_values(self):
 
 class AudioGenerator(TaskGenerator):
     def input_values(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input values."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate input values."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
+
         return self.generate_random_floats(
             min_value=-1,
             max_value=1,
@@ -186,18 +146,7 @@ def input_values(self):
         )
 
     def input_features(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input features."
-        )
-        assert self.shapes.get("feature_size", None) is not None, (
-            "Feature size couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `feature_size` to be able to generate input features."
-        )
-        assert self.shapes.get("nb_max_frames", None) is not None, (
-            "Number of max frames couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `nb_max_frames` to be able to generate input features."
-        )
+        self.assert_not_missing_shapes(["batch_size", "feature_size", "nb_max_frames"])
 
         return self.generate_random_floats(
             min_value=-1,
@@ -212,10 +161,7 @@ def input_features(self):
 
 class TextClassificationGenerator(TextGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -243,14 +189,7 @@ def __call__(self):
 
 class TokenClassificationGenerator(TextGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -302,14 +241,7 @@ def __call__(self):
 
 class QuestionAnsweringGenerator(TextGenerator):
     def start_positions(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate start positions."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate start positions."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -318,14 +250,7 @@ def start_positions(self):
         )
 
     def end_positions(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate end positions."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate end positions."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -368,14 +293,7 @@ def __call__(self):
 
 class MultipleChoiceGenerator(TextGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("num_choices", None) is not None, (
-            "Number of choices must be provided, "
-            "please provide it in `input_shapes` as `num_choices` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "num_choices"])
 
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_choices"], shape=(self.shapes["batch_size"],)
@@ -411,10 +329,7 @@ def __call__(self):
 
 class ImageClassificationGenerator(ImageGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -434,14 +349,7 @@ def __call__(self):
 
 class ObjectDetectionGenerator(ImageGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("num_queries", None) is not None, (
-            "Number of queries must be provided, "
-            "please provide it in `input_shapes` as `num_queries` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "num_queries"])
 
         return [
             {
@@ -467,18 +375,7 @@ def __call__(self):
 
 class SemanticSegmentationGenerator(ImageGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("height", None) is not None, (
-            "Height couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `height` to be able to generate labels."
-        )
-        assert self.shapes.get("width", None) is not None, (
-            "Width couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `width` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "height", "width"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -498,14 +395,7 @@ def __call__(self):
 
 class AudioClassificationGenerator(AudioGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("num_labels", None) is not None, (
-            "Number of labels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_integers(
             min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],)
@@ -524,18 +413,7 @@ def __call__(self):
 
 class AutomaticSpeechRecognitionGenerator(AudioGenerator):
     def labels(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate labels."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate labels."
-        )
-        assert self.shapes.get("num_labels", None) is not None, (
-            "Number of labels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_labels` to be able to generate labels."
-        )
+        self.assert_not_missing_shapes(["batch_size", "sequence_length"])
 
         return self.generate_random_integers(
             min_value=0,
@@ -554,10 +432,7 @@ def __call__(self):
 
 class PromptGenerator(TaskGenerator):
     def prompt(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate prompts."
-        )
+        self.assert_not_missing_shapes(["batch_size"])
 
         return self.generate_random_strings(num_seq=self.shapes["batch_size"])
 
@@ -575,45 +450,19 @@ def __call__(self):
 
 class ImageTextToTextGenerationGenerator(TaskGenerator):
     def input_ids(self):
-        assert self.shapes.get("batch_size", None) is not None, (
-            "Batch size must be provided, "
-            "please provide it in `input_shapes` as `batch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("sequence_length", None) is not None, (
-            "Sequence length must be provided, "
-            "please provide it in `input_shapes` as `sequence_length` to be able to generate input ids."
-        )
-        assert self.shapes.get("num_images", None) is not None, (
-            "Number of images must be provided, "
-            "please provide it in `input_shapes` as `num_images` to be able to generate input ids."
-        )
-        assert self.shapes.get("num_channels", None) is not None, (
-            "Number of channels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_channels` to be able to generate input ids."
-        )
-        assert self.shapes.get("height", None) is not None, (
-            "Height couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `height` to be able to generate input ids."
-        )
-        assert self.shapes.get("width", None) is not None, (
-            "Width couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `width` to be able to generate input ids."
-        )
-        assert self.shapes.get("patch_size", None) is not None, (
-            "Patch size must be provided, "
-            "please provide it in `input_shapes` as `patch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("temporal_patch_size", None) is not None, (
-            "Temporal patch size must be provided, "
-            "please provide it in `input_shapes` as `temporal_patch_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("spatial_merge_size", None) is not None, (
-            "Spatial merge size must be provided, "
-            "please provide it in `input_shapes` as `spatial_merge_size` to be able to generate input ids."
-        )
-        assert self.shapes.get("image_token_id", None) is not None, (
-            "Image token id must be provided, "
-            "please provide it in `input_shapes` as `image_token_id` to be able to generate input ids."
-        )
+        self.assert_not_missing_shapes(
+            [
+                "batch_size",
+                "sequence_length",
+                "num_images",
+                "num_channels",
+                "height",
+                "width",
+                "patch_size",
+                "temporal_patch_size",
+                "spatial_merge_size",
+                "image_token_id",
+            ]
+        )
 
         text_tokens = self.generate_random_integers(
             min_value=0,
@@ -644,29 +493,10 @@ def input_ids(self):
         return torch.cat((text_tokens, image_tokens), dim=1)
 
     def pixel_values(self):
-        assert self.shapes.get("num_images", None) is not None, (
-            "Number of images must be provided, "
-            "please provide it in `input_shapes` as `num_images` to be able to generate pixel values."
-        )
-        assert self.shapes.get("num_channels", None) is not None, (
-            "Number of channels couldn't be inferred automatically from model, "
-            "please provide it in `input_shapes` as `num_channels` to be able to generate pixel values."
- ) - assert self.shapes.get("width", None) is not None, ( - "Width couldn't be inferred automatically from model, " - "please provide it in `input_shapes` as `width` to be able to generate image grid." - ) + self.assert_not_missing_shapes(["num_images", "height", "width", "patch_size"]) return torch.tensor( [ diff --git a/test.py b/test.py new file mode 100644 index 000000000..6c96b0581 --- /dev/null +++ b/test.py @@ -0,0 +1,53 @@ +from transformers import AutoProcessor, Idefics2Processor + +processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics-9b") +print(processor.to_dict()) + +# dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" +# dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg" + +# prompts = [ +# [ +# "User:", +# dogs_image_url_1, +# "Describe this image.\nAssistant: An image of two dogs.\n", +# "User:", +# dogs_image_url_2, +# "Describe this image.\nAssistant:", +# ] +# ] + +# inputs = processor(prompts, return_tensors="pt") + +# print("inputs_ids", inputs["input_ids"].shape) +# print("pixel_values", inputs["pixel_values"].shape) + +# batch_size = 1 +# sequence_length = 128 + +# num_images = 1 +# num_channels = 3 +# height = 224 +# width = 224 + +# patch_size = 14 +# temporal_patch_size = 2 + +# input_ids = torch.rand( +# size=( +# batch_size, +# sequence_length, +# ) +# ) + +# pixel_values = torch.rand( +# size=( +# num_images * int(height / patch_size) * int(width / patch_size), +# num_channels * patch_size * patch_size * temporal_patch_size, +# ) +# ) +# image_grid_thw = torch.tensor([[num_images, int(height / patch_size), int(width / patch_size)]]) + + +# print("image_grid_thw", image_grid_thw) +# print("pixel_values", pixel_values.shape) From e5bf852f8ba04df0c11150cccdcc5fe9d3dd8909 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 20 Nov 2024 12:51:49 +0100 Subject: [PATCH 4/7] support idefics and idefics2 --- examples/pytorch_vlm.yaml | 4 +- .../backends/transformers_utils.py | 41 +-- optimum_benchmark/generators/base.py | 52 ++++ .../generators/dataset_generator.py | 38 ++- .../generators/input_generator.py | 35 ++- .../generators/model_generator.py | 259 ++++++++++++++++++ .../generators/task_generator.py | 167 ++--------- .../scenarios/inference/scenario.py | 5 +- 8 files changed, 407 insertions(+), 194 deletions(-) create mode 100644 optimum_benchmark/generators/base.py create mode 100644 optimum_benchmark/generators/model_generator.py diff --git a/examples/pytorch_vlm.yaml b/examples/pytorch_vlm.yaml index f11c4fcb3..c4bb786fe 100644 --- a/examples/pytorch_vlm.yaml +++ b/examples/pytorch_vlm.yaml @@ -17,7 +17,7 @@ backend: device_ids: 0 no_weights: true torch_dtype: float16 - model: Qwen/Qwen2-VL-7B-Instruct + model: HuggingFaceM4/idefics2-8b scenario: memory: true @@ -32,7 +32,7 @@ scenario: batch_size: 1 sequence_length: 256 # image - num_images: 1 + num_images: 2 num_channels: 3 height: 224 width: 224 diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 8ecbde01f..5c1b18dda 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -111,19 +111,22 @@ def get_flat_dict(d: Dict[str, Any]) -> Dict[str, Any]: def get_flat_artifact_dict(artifact: Union[PretrainedConfig, PretrainedProcessor]) -> Dict[str, Any]: - if isinstance(artifact, ProcessorMixin): - artifact_dict = {} + artifact_dict = {} + if 
isinstance(artifact, ProcessorMixin):
+        artifact_dict.update(
+            {k: v for k, v in artifact.__dict__.items() if isinstance(v, (int, str, float, bool, list, tuple, dict))}
+        )
         for attribute in artifact.attributes:
-            artifact_dict.update(get_flat_artifact_dict(attribute))
+            artifact_dict.update(get_flat_artifact_dict(getattr(artifact, attribute)))
+    elif hasattr(artifact, "to_dict"):
+        artifact_dict.update(
+            {k: v for k, v in artifact.to_dict().items() if isinstance(v, (int, str, float, bool, list, tuple, dict))}
+        )
     else:
-        if hasattr(artifact, "to_dict"):
-            artifact_dict = {k: v for k, v in artifact.to_dict().items() if v is not None}
-        else:
-            try:
-                artifact_dict = {k: getattr(artifact, k) for k in dir(artifact) if getattr(artifact, k) is not None}
-            except Exception:
-                pass
+        artifact_dict.update(
+            {k: v for k, v in artifact.__dict__.items() if isinstance(v, (int, str, float, bool, list, tuple, dict))}
+        )

     artifact_dict = get_flat_dict(artifact_dict)

@@ -198,16 +201,22 @@ def extract_transformers_shapes_from_artifacts(
         shapes["num_queries"] = flat_artifacts_dict["num_queries"]

     # image-text input
-    if "image_token_id" in flat_artifacts_dict:
-        shapes["image_token_id"] = flat_artifacts_dict["image_token_id"]
-    if "in_chans" in flat_artifacts_dict:
-        shapes["num_channels"] = flat_artifacts_dict["in_chans"]
+    if "patch_size" in flat_artifacts_dict:
         shapes["patch_size"] = flat_artifacts_dict["patch_size"]
-    if "temporal_patch_size" in flat_artifacts_dict:
-        shapes["temporal_patch_size"] = flat_artifacts_dict["temporal_patch_size"]
+    if "in_chans" in flat_artifacts_dict:
+        shapes["num_channels"] = flat_artifacts_dict["in_chans"]
+    if "image_seq_len" in flat_artifacts_dict:
+        shapes["image_seq_len"] = flat_artifacts_dict["image_seq_len"]
+    if "image_token_id" in flat_artifacts_dict:
+        shapes["image_token_id"] = flat_artifacts_dict["image_token_id"]
     if "spatial_merge_size" in flat_artifacts_dict:
         shapes["spatial_merge_size"] = flat_artifacts_dict["spatial_merge_size"]
+    if "do_image_splitting" in flat_artifacts_dict:
+        shapes["do_image_splitting"] = flat_artifacts_dict["do_image_splitting"]
+
+    if "temporal_patch_size" in flat_artifacts_dict:
+        shapes["temporal_patch_size"] = flat_artifacts_dict["temporal_patch_size"]

     return shapes

diff --git a/optimum_benchmark/generators/base.py b/optimum_benchmark/generators/base.py
new file mode 100644
index 000000000..e4d779b93
--- /dev/null
+++ b/optimum_benchmark/generators/base.py
@@ -0,0 +1,52 @@
+import logging
+import random
+import string
+from abc import ABC
+from typing import Dict, List, Tuple
+
+import torch
+
+LOGGER = logging.getLogger("generators")
+
+
+class BaseGenerator(ABC):
+    def __init__(self, shapes: Dict[str, int], with_labels: bool):
+        self.shapes = shapes
+        self.with_labels = with_labels
+
+    def assert_not_missing_shapes(self, required_shapes: List[str]):
+        for shape in required_shapes:
+            assert self.shapes.get(shape, None) is not None, (
+                f"{shape} couldn't be inferred automatically from the model artifacts and must be provided by the user. "
+                f"Please provide it under `scenario.input_shapes.{shape}` or open an issue/PR in optimum-benchmark repository. 
" + ) + + @staticmethod + def generate_constant_integers(value: int, shape: Tuple[int]): + return torch.full(shape, value, dtype=torch.int64) + + @staticmethod + def generate_constant_floats(value: float, shape: Tuple[int]): + return torch.full(shape, value, dtype=torch.float32) + + @staticmethod + def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]): + return torch.randint(min_value, max_value, shape) + + @staticmethod + def generate_random_floats(min_value: float, max_value: float, shape: Tuple[int]): + return torch.rand(shape) * (max_value - min_value) + min_value + + @staticmethod + def generate_ranges(start: int, stop: int, shape: Tuple[int]): + return torch.arange(start, stop).repeat(shape[0], 1) + + @staticmethod + def generate_random_strings(num_seq: int) -> List[str]: + return [ + "".join(random.choice(string.ascii_letters + string.digits) for _ in range(random.randint(10, 100))) + for _ in range(num_seq) + ] + + def __call__(self): + raise NotImplementedError("Generator must implement __call__ method") diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py index 781e20d96..efc8a0294 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -1,29 +1,41 @@ -from typing import Dict +from typing import Dict, Optional from datasets import Dataset -from .task_generator import TASKS_TO_GENERATORS, TaskGenerator +from .base import BaseGenerator +from .model_generator import MODEL_TYPE_TO_GENERATORS +from .task_generator import TASKS_TO_GENERATORS class DatasetGenerator: - task_generator: TaskGenerator + generator: BaseGenerator - def __init__(self, task: str, dataset_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None: - dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size", None) + def __init__( + self, + task: str, + dataset_shapes: Dict[str, int], + model_shapes: Dict[str, int], + model_type: Optional[str] = None, + ) -> None: + # dataset_shapes take precedence over model_shapes + all_shapes = {**model_shapes, **dataset_shapes} + all_shapes["batch_size"] = all_shapes.pop("dataset_size", None) - if task in TASKS_TO_GENERATORS: - all_shapes = {**model_shapes, **dataset_shapes} # dataset_shapes take precedence over model_shapes - self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=True) + if model_type in MODEL_TYPE_TO_GENERATORS: + self.generator = MODEL_TYPE_TO_GENERATORS[model_type](shapes=all_shapes, with_labels=True) + elif task in TASKS_TO_GENERATORS: + self.generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=True) else: raise NotImplementedError( - f"Task {task} is supported. \n" - f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. \n" - "If you want to add support for this task, " - "please submit a PR or a feature request to optimum-benchmark. \n" + f"Task {task} is not supported for dataset generation. " + f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. " + f"Available model types: {list(MODEL_TYPE_TO_GENERATORS.keys())}. " + "If you want to add support for this task or model type, " + "please submit a PR or a feature request to optimum-benchmark." 
) def __call__(self) -> Dataset: - task_dataset = self.task_generator() + task_dataset = self.generator() task_dataset = Dataset.from_dict(task_dataset) task_dataset.set_format(type="torch", columns=list(task_dataset.features.keys())) return task_dataset diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index 10432fa95..2f05dc62f 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -1,23 +1,36 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional -from .task_generator import TASKS_TO_GENERATORS, TaskGenerator +from .base import BaseGenerator +from .model_generator import MODEL_TYPE_TO_GENERATORS +from .task_generator import TASKS_TO_GENERATORS class InputGenerator: - task_generator: TaskGenerator + generator: BaseGenerator - def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[str, int]) -> None: - if task in TASKS_TO_GENERATORS: - all_shapes = {**model_shapes, **input_shapes} # input_shapes take precedence over model_shapes - self.task_generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=False) + def __init__( + self, + task: str, + input_shapes: Dict[str, int], + model_shapes: Dict[str, int], + model_type: Optional[str] = None, + ) -> None: + # input_shapes take precedence over model_shapes + all_shapes = {**model_shapes, **input_shapes} + + if model_type in MODEL_TYPE_TO_GENERATORS: + self.generator = MODEL_TYPE_TO_GENERATORS[model_type](shapes=all_shapes, with_labels=False) + elif task in TASKS_TO_GENERATORS: + self.generator = TASKS_TO_GENERATORS[task](shapes=all_shapes, with_labels=False) else: raise NotImplementedError( - f"Task {task} is not supported. " + f"Task {task} is not supported for input generation. " f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. " - "If you want to add support for this task, " - "please submit a PR or a feature request to optimum-benchmark. " + f"Available model types: {list(MODEL_TYPE_TO_GENERATORS.keys())}. " + "If you want to add support for this task or model type, " + "please submit a PR or a feature request to optimum-benchmark." 
)

     def __call__(self) -> Dict[str, Any]:
-        task_input = self.task_generator()
+        task_input = self.generator()
         return task_input
diff --git a/optimum_benchmark/generators/model_generator.py b/optimum_benchmark/generators/model_generator.py
new file mode 100644
index 000000000..711eeed14
--- /dev/null
+++ b/optimum_benchmark/generators/model_generator.py
@@ -0,0 +1,259 @@
+import logging
+
+import torch
+
+from .base import BaseGenerator
+
+LOGGER = logging.getLogger("generators")
+
+DEFAULT_VOCAB_SIZE = 2
+
+
+class IdeficsGenerator(BaseGenerator):
+    def input_ids(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images", "image_token_id"])
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE),
+            shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
+        )
+
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(self.shapes["batch_size"], self.shapes["num_images"]),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"] + self.shapes["num_images"],
+            ),
+        )
+
+    def pixel_values(self):
+        self.assert_not_missing_shapes(["batch_size", "num_images", "num_channels", "height", "width"])
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"],
+                self.shapes["num_channels"],
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
+        )
+
+    def image_attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"] + self.shapes["num_images"],
+                self.shapes["num_images"],
+            ),
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["attention_mask"] = self.attention_mask()
+        dummy["image_attention_mask"] = self.image_attention_mask()
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
+class Idefics2Generator(BaseGenerator):
+    def input_ids(self):
+        self.assert_not_missing_shapes(
+            ["batch_size", "sequence_length", "num_images", "image_seq_len", "image_token_id", "do_image_splitting"]
+        )
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE),
+            shape=(self.shapes["batch_size"], self.shapes["sequence_length"]),
+        )
+
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"]
+                * self.shapes["image_seq_len"]
+                * (5 if self.shapes["do_image_splitting"] else 1),  # splitting yields 4 crops + the original image
+            ),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "sequence_length", "num_images", "image_seq_len", "do_image_splitting"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"]
+                + self.shapes["num_images"]
+                * self.shapes["image_seq_len"]
+                * (5 if self.shapes["do_image_splitting"] else 1),
+            ),
+        )
+
+    def pixel_values(self):
+        self.assert_not_missing_shapes(
+            ["batch_size", "num_images", "num_channels", "height", "width", "do_image_splitting"]
+        )
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"] * (5 if self.shapes["do_image_splitting"] else 1),
+                self.shapes["num_channels"],
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
+        )
+
+    def pixel_attention_mask(self):
+        self.assert_not_missing_shapes(["batch_size", "num_images", "height", "width", "do_image_splitting"])
+
+        return self.generate_constant_integers(
+            value=1,  # no sparsity
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["num_images"] * (5 if self.shapes["do_image_splitting"] else 1),
+                self.shapes["height"],
+                self.shapes["width"],
+            ),
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["attention_mask"] = self.attention_mask()
+        dummy["pixel_attention_mask"] = self.pixel_attention_mask()
+
+        LOGGER.debug("input_ids: %s", dummy["input_ids"].shape)
+        LOGGER.debug("pixel_values: %s", dummy["pixel_values"].shape)
+        LOGGER.debug("attention_mask: %s", dummy["attention_mask"].shape)
+        LOGGER.debug("pixel_attention_mask: %s", dummy["pixel_attention_mask"].shape)
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
+class Qwen2VLGenerator(BaseGenerator):
+    def input_ids(self):
+        self.assert_not_missing_shapes(
+            [
+                "batch_size",
+                "sequence_length",
+                "num_images",
+                "num_channels",
+                "height",
+                "width",
+                "patch_size",
+                "temporal_patch_size",
+                "spatial_merge_size",
+                "image_token_id",
+            ]
+        )
+
+        text_tokens = self.generate_random_integers(
+            min_value=0,
+            max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE),
+            shape=(
+                self.shapes["batch_size"],
+                self.shapes["sequence_length"],
+            ),
+        )
+        image_tokens = self.generate_constant_integers(
+            value=self.shapes["image_token_id"],
+            shape=(
+                self.shapes["batch_size"],
+                int(
+                    self.shapes["num_images"]
+                    * self.shapes["height"]
+                    * self.shapes["width"]
+                    / self.shapes["temporal_patch_size"]
+                    / self.shapes["spatial_merge_size"]
+                    / self.shapes["patch_size"] ** 2
+                ),
+            ),
+        )
+
+        return torch.cat((text_tokens, image_tokens), dim=1)
+
+    def pixel_values(self):
+        self.assert_not_missing_shapes(
+            ["num_images", "num_channels", "height", "width", "patch_size", "temporal_patch_size"]
+        )
+
+        return self.generate_random_floats(
+            min_value=0,
+            max_value=1,
+            shape=(
+                self.shapes["num_images"]
+                * int(self.shapes["height"] / self.shapes["patch_size"])
+                * int(self.shapes["width"] / self.shapes["patch_size"]),
+                self.shapes["num_channels"]
+                * self.shapes["patch_size"]
+                * self.shapes["patch_size"]
+                * self.shapes["temporal_patch_size"],
+            ),
+        )
+
+    def image_grid_thw(self):
+        self.assert_not_missing_shapes(["num_images", "height", "width", "patch_size"])
+
+        return torch.tensor(
+            [
+                [
+                    self.shapes["num_images"],
+                    int(self.shapes["height"] / self.shapes["patch_size"]),
+                    int(self.shapes["width"] / self.shapes["patch_size"]),
+                ]
+            ]
+        )
+
+    def __call__(self):
+        dummy = {}
+
+        dummy["input_ids"] = self.input_ids()
+        dummy["pixel_values"] = self.pixel_values()
+        dummy["image_grid_thw"] = self.image_grid_thw()
+
+        if self.with_labels:
+            dummy["labels"] = self.input_ids()
+
+        return dummy
+
+
+MODEL_TYPE_TO_GENERATORS = {
+    "idefics": IdeficsGenerator,
+    "idefics2": Idefics2Generator,
+    "qwen2-vl": Qwen2VLGenerator,
+}
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index
ccf89cdc1..c0f37d14e 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -1,11 +1,6 @@ import logging -import random -import string -from abc import ABC -from typing import Dict, List, Tuple -# TODO: drop torch dependency and use numpy instead -import torch +from .base import BaseGenerator LOGGER = logging.getLogger("generators") @@ -14,56 +9,13 @@ DEFAULT_TYPE_VOCAB_SIZE = 2 -class TaskGenerator(ABC): - def __init__(self, shapes: Dict[str, int], with_labels: bool): - self.shapes = shapes - self.with_labels = with_labels - - def assert_not_missing_shapes(self, required_shapes: List[str]): - for shape in required_shapes: - assert self.shapes.get(shape, None) is not None, ( - f"{shape} either couldn't be inferred automatically from model artifacts or should be provided by the user. " - f"Please provide it under `scenario.input_shapes.{shape}` or open an issue/PR in optimum-benchmark repository. " - ) - - @staticmethod - def generate_constant_integers(value: int, shape: Tuple[int]): - return torch.full(shape, value, dtype=torch.int64) - - @staticmethod - def generate_constant_floats(value: float, shape: Tuple[int]): - return torch.full(shape, value, dtype=torch.float32) - - @staticmethod - def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]): - return torch.randint(min_value, max_value, shape) - - @staticmethod - def generate_random_floats(min_value: float, max_value: float, shape: Tuple[int]): - return torch.rand(shape) * (max_value - min_value) + min_value - - @staticmethod - def generate_ranges(start: int, stop: int, shape: Tuple[int]): - return torch.arange(start, stop).repeat(shape[0], 1) - - @staticmethod - def generate_random_strings(num_seq: int) -> List[str]: - return [ - "".join(random.choice(string.ascii_letters + string.digits) for _ in range(random.randint(10, 100))) - for _ in range(num_seq) - ] - - def __call__(self): - raise NotImplementedError("Generator must implement __call__ method") - - -class TextGenerator(TaskGenerator): +class TextGenerator(BaseGenerator): def input_ids(self): self.assert_not_missing_shapes(["batch_size", "sequence_length"]) return self.generate_random_integers( min_value=0, - max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE, + max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE), shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) @@ -80,7 +32,7 @@ def token_type_ids(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes.get("type_vocab_size", None) or DEFAULT_TYPE_VOCAB_SIZE, + max_value=self.shapes.get("type_vocab_size", DEFAULT_TYPE_VOCAB_SIZE), shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) @@ -102,7 +54,7 @@ def requires_position_ids(self): ) -class ImageGenerator(TaskGenerator): +class ImageGenerator(BaseGenerator): def pixel_values(self): self.assert_not_missing_shapes(["batch_size", "num_channels", "height", "width"]) @@ -118,7 +70,7 @@ def pixel_values(self): ) -class AudioGenerator(TaskGenerator): +class AudioGenerator(BaseGenerator): def input_values(self): self.assert_not_missing_shapes(["batch_size", "sequence_length"]) @@ -151,7 +103,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"],), ) @@ -179,7 +131,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - 
max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) @@ -319,7 +271,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"],), ) @@ -341,7 +293,7 @@ def labels(self): { "class_labels": self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["num_queries"],), ), "boxes": self.generate_random_floats(min_value=-1, max_value=1, shape=(self.shapes["num_queries"], 4)), @@ -365,7 +317,7 @@ def labels(self): return self.generate_random_integers( min_value=0, - max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, + max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"], self.shapes["height"], self.shapes["width"]), ) @@ -384,7 +336,7 @@ def labels(self): self.assert_not_missing_shapes(["batch_size"]) return self.generate_random_integers( - min_value=0, max_value=self.shapes["num_labels"] or DEFAULT_NUM_LABELS, shape=(self.shapes["batch_size"],) + min_value=0, max_value=self.shapes.get("num_labels", DEFAULT_NUM_LABELS), shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -417,7 +369,7 @@ def __call__(self): return dummy -class PromptGenerator(TaskGenerator): +class PromptGenerator(BaseGenerator): def prompt(self): self.assert_not_missing_shapes(["batch_size"]) @@ -434,9 +386,7 @@ class FeatureExtractionGenerator(TextGenerator, ImageGenerator): def __call__(self): dummy = {} - if self.shapes.get("num_channels", None) is not None and self.shapes.get("height", None) is not None: - dummy["pixel_values"] = self.pixel_values() - else: + if self.shapes.get("sequence_length", None) is not None: dummy["input_ids"] = self.input_ids() dummy["attention_mask"] = self.attention_mask() @@ -446,92 +396,8 @@ def __call__(self): if self.requires_position_ids(): dummy["position_ids"] = self.position_ids() - return dummy - - -class ImageTextToTextGenerationGenerator(TaskGenerator): - def input_ids(self): - self.assert_not_missing_shapes( - [ - "batch_size", - "sequence_length", - "num_images", - "num_channels", - "height", - "width", - "patch_size", - "temporal_patch_size", - "spatial_merge_size", - "image_token_id", - ] - ) - - text_tokens = self.generate_random_integers( - min_value=0, - max_value=self.shapes.get("vocab_size", None) or DEFAULT_VOCAB_SIZE, - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), - ) - image_tokens = self.generate_constant_integers( - value=self.shapes["image_token_id"], - shape=( - self.shapes["batch_size"], - int( - self.shapes["num_images"] - * self.shapes["height"] - * self.shapes["width"] - / self.shapes["temporal_patch_size"] - / self.shapes["spatial_merge_size"] - / self.shapes["patch_size"] ** 2 - ), - ), - ) - - return torch.cat((text_tokens, image_tokens), dim=1) - - def pixel_values(self): - self.assert_not_missing_shapes( - ["num_images", "num_channels", "height", "width", "patch_size", "temporal_patch_size"] - ) - - return self.generate_random_floats( - min_value=0, - max_value=1, - shape=( - self.shapes["num_images"] - * int(self.shapes["height"] / self.shapes["patch_size"]) - * int(self.shapes["width"] / self.shapes["patch_size"]), - self.shapes["num_channels"] - 
* self.shapes["patch_size"] - * self.shapes["patch_size"] - * self.shapes["temporal_patch_size"], - ), - ) - - def image_grid_thw(self): - self.assert_not_missing_shapes(["num_images", "height", "width", "patch_size"]) - - return torch.tensor( - [ - [ - self.shapes["num_images"], - int(self.shapes["height"] / self.shapes["patch_size"]), - int(self.shapes["width"] / self.shapes["patch_size"]), - ] - ] - ) - - def __call__(self): - dummy = {} - - dummy["input_ids"] = self.input_ids() - dummy["pixel_values"] = self.pixel_values() - dummy["image_grid_thw"] = self.image_grid_thw() - - if self.with_labels: - dummy["labels"] = self.input_ids() + if self.shapes.get("height", None) is not None: + dummy["pixel_values"] = self.pixel_values() return dummy @@ -549,7 +415,6 @@ def __call__(self): "image-classification": ImageClassificationGenerator, "object-detection": ObjectDetectionGenerator, "semantic-segmentation": SemanticSegmentationGenerator, - "image-text-to-text": ImageTextToTextGenerationGenerator, # diffusers pipelines tasks "text-to-image": PromptGenerator, "stable-diffusion": PromptGenerator, diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 8b3bb1b76..28182adb5 100644 --- a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -60,7 +60,10 @@ def __init__(self, config: InferenceConfig) -> None: def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.logger.info("\t+ Creating input generator") self.input_generator = InputGenerator( - task=backend.config.task, model_shapes=backend.model_shapes, input_shapes=self.config.input_shapes + task=backend.config.task, + input_shapes=self.config.input_shapes, + model_shapes=backend.model_shapes, + model_type=backend.config.model_type, ) if backend.config.task in TEXT_GENERATION_TASKS: From 44caa158f12f4302e6f0709099129893eb1a833f Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 20 Nov 2024 12:57:39 +0100 Subject: [PATCH 5/7] remove file --- test.py | 53 ----------------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 6c96b0581..000000000 --- a/test.py +++ /dev/null @@ -1,53 +0,0 @@ -from transformers import AutoProcessor, Idefics2Processor - -processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics-9b") -print(processor.to_dict()) - -# dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" -# dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg" - -# prompts = [ -# [ -# "User:", -# dogs_image_url_1, -# "Describe this image.\nAssistant: An image of two dogs.\n", -# "User:", -# dogs_image_url_2, -# "Describe this image.\nAssistant:", -# ] -# ] - -# inputs = processor(prompts, return_tensors="pt") - -# print("inputs_ids", inputs["input_ids"].shape) -# print("pixel_values", inputs["pixel_values"].shape) - -# batch_size = 1 -# sequence_length = 128 - -# num_images = 1 -# num_channels = 3 -# height = 224 -# width = 224 - -# patch_size = 14 -# temporal_patch_size = 2 - -# input_ids = torch.rand( -# size=( -# batch_size, -# sequence_length, -# ) -# ) - -# pixel_values = torch.rand( -# size=( -# num_images * int(height / patch_size) * int(width / patch_size), -# num_channels * patch_size * patch_size * temporal_patch_size, -# ) -# ) -# image_grid_thw = torch.tensor([[num_images, 
int(height / patch_size), int(width / patch_size)]]) - - -# print("image_grid_thw", image_grid_thw) -# print("pixel_values", pixel_values.shape) From 2248f8e319c103af1351319c2fec5c98dfbb25f2 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 20 Nov 2024 13:36:45 +0100 Subject: [PATCH 6/7] support generic image-text-to-text as well (blip, blip2, ..) --- examples/pytorch_vlm.yaml | 2 +- optimum_benchmark/backends/transformers_utils.py | 2 +- optimum_benchmark/generators/model_generator.py | 2 +- optimum_benchmark/generators/task_generator.py | 15 +++++++++++++++ optimum_benchmark/scenarios/inference/scenario.py | 2 -- tests/configs/_image_text_to_text_.yaml | 7 +++++++ .../cpu_inference_pytorch_image_text_to_text.yaml | 11 +++++++++++ 7 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 tests/configs/_image_text_to_text_.yaml create mode 100644 tests/configs/cpu_inference_pytorch_image_text_to_text.yaml diff --git a/examples/pytorch_vlm.yaml b/examples/pytorch_vlm.yaml index c4bb786fe..a39f8c8aa 100644 --- a/examples/pytorch_vlm.yaml +++ b/examples/pytorch_vlm.yaml @@ -17,7 +17,7 @@ backend: device_ids: 0 no_weights: true torch_dtype: float16 - model: HuggingFaceM4/idefics2-8b + model: Qwen/Qwen2-VL-7B-Instruct scenario: memory: true diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 5c1b18dda..7e39c9294 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -161,7 +161,7 @@ def extract_transformers_shapes_from_artifacts( # image input if "num_channels" in flat_artifacts_dict: - shapes["num_channels"] = flat_artifacts_dict.get("channels", None) + shapes["num_channels"] = flat_artifacts_dict["num_channels"] if "image_size" in flat_artifacts_dict: image_size = flat_artifacts_dict["image_size"] diff --git a/optimum_benchmark/generators/model_generator.py b/optimum_benchmark/generators/model_generator.py index 711eeed14..e709398a7 100644 --- a/optimum_benchmark/generators/model_generator.py +++ b/optimum_benchmark/generators/model_generator.py @@ -255,5 +255,5 @@ def __call__(self): MODEL_TYPE_TO_GENERATORS = { "idefics": IdeficsGenerator, "idefics2": Idefics2Generator, - "qwen2-vl": Qwen2VLGenerator, + "qwen2_vl": Qwen2VLGenerator, } diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index c0f37d14e..9f6834d38 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -402,6 +402,20 @@ def __call__(self): return dummy +class ImageTextToTextGenerator(TextGenerator, ImageGenerator): + def __call__(self): + dummy = {} + + dummy["input_ids"] = self.input_ids() + dummy["attention_mask"] = self.attention_mask() + dummy["pixel_values"] = self.pixel_values() + + if self.with_labels: + dummy["labels"] = self.input_ids() + + return dummy + + TASKS_TO_GENERATORS = { # transformers models tasks "feature-extraction": FeatureExtractionGenerator, @@ -415,6 +429,7 @@ def __call__(self): "image-classification": ImageClassificationGenerator, "object-detection": ObjectDetectionGenerator, "semantic-segmentation": SemanticSegmentationGenerator, + "image-text-to-text": ImageTextToTextGenerator, # diffusers pipelines tasks "text-to-image": PromptGenerator, "stable-diffusion": PromptGenerator, diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 28182adb5..512f269df 100644 --- 
a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -21,8 +21,6 @@ "min_new_tokens": 100, "do_sample": False, "use_cache": True, - "pad_token_id": 0, - "eos_token_id": 0, "num_beams": 1, } TEXT_GENERATION_PREFILL_OVERRIDES = { diff --git a/tests/configs/_image_text_to_text_.yaml b/tests/configs/_image_text_to_text_.yaml new file mode 100644 index 000000000..aa8357f75 --- /dev/null +++ b/tests/configs/_image_text_to_text_.yaml @@ -0,0 +1,7 @@ +hydra: + mode: MULTIRUN + sweeper: + params: + backend.task: image-text-to-text + backend.model: hf-internal-testing/tiny-random-BlipForConditionalGeneration,hf-internal-testing/tiny-random-Blip2ForConditionalGeneration,hf-internal-testing/tiny-random-IdeficsForVisionText2Text + +scenario.input_shapes.num_images: 2 diff --git a/tests/configs/cpu_inference_pytorch_image_text_to_text.yaml b/tests/configs/cpu_inference_pytorch_image_text_to_text.yaml new file mode 100644 index 000000000..df125a3ac --- /dev/null +++ b/tests/configs/cpu_inference_pytorch_image_text_to_text.yaml @@ -0,0 +1,11 @@ +defaults: + # order of inheritance, last one overrides previous ones + - _base_ # inherits from base config + - _cpu_ # inherits from cpu config + - _inference_ # inherits from inference config + - _image_text_to_text_ # inherits from image text to text config + - _no_weights_ # inherits from no weights config + - _self_ # hydra 1.1 compatibility + - override backend: pytorch + +name: cpu_inference_pytorch_image_text_to_text From 62746cc71e349121e6cbdbb90ed4a20223be0253 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 21 Nov 2024 14:22:57 +0100 Subject: [PATCH 7/7] num_choices in tests --- .../backends/transformers_utils.py | 7 ++- .../generators/task_generator.py | 45 ++++++++++++------- tests/configs/_image_text_to_text_.yaml | 5 ++- tests/test_api.py | 17 ++++++- 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 7e39c9294..2212cd5ff 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -62,8 +62,11 @@ model_loaders = (model_loaders,) for model_loader_name in model_loaders: - model_loader_class = getattr(transformers, model_loader_name) - TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name].update(model_loader_class._model_mapping._model_mapping) + model_loader_class = getattr(transformers, model_loader_name, None) + if model_loader_class is not None: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name].update( + model_loader_class._model_mapping._model_mapping + ) else: TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 9f6834d38..f11d21eb0 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -230,6 +230,32 @@ def __call__(self): class MultipleChoiceGenerator(TextGenerator): + def input_ids(self): + self.assert_not_missing_shapes(["batch_size", "num_choices", "sequence_length"]) + + return self.generate_random_integers( + min_value=0, + max_value=self.shapes.get("vocab_size", DEFAULT_VOCAB_SIZE), + shape=(self.shapes["batch_size"], self.shapes["num_choices"], self.shapes["sequence_length"]), + ) + + def attention_mask(self): + self.assert_not_missing_shapes(["batch_size", "num_choices", "sequence_length"]) + + return self.generate_constant_integers( + 
value=1, # no sparsity + shape=(self.shapes["batch_size"], self.shapes["num_choices"], self.shapes["sequence_length"]), + ) + + def token_type_ids(self): + self.assert_not_missing_shapes(["batch_size", "num_choices", "sequence_length"]) + + return self.generate_random_integers( + min_value=0, + max_value=self.shapes.get("type_vocab_size", DEFAULT_TYPE_VOCAB_SIZE), + shape=(self.shapes["batch_size"], self.shapes["num_choices"], self.shapes["sequence_length"]), + ) + def labels(self): self.assert_not_missing_shapes(["batch_size", "num_choices"]) @@ -240,24 +266,11 @@ def labels(self): def __call__(self): dummy = {} - dummy["input_ids"] = ( - self.input_ids() - .reshape(self.shapes["batch_size"], 1, self.shapes["sequence_length"]) - .repeat(1, self.shapes["num_choices"], 1) - ) - - dummy["attention_mask"] = ( - self.attention_mask() - .reshape(self.shapes["batch_size"], 1, self.shapes["sequence_length"]) - .repeat(1, self.shapes["num_choices"], 1) - ) + dummy["input_ids"] = self.input_ids() + dummy["attention_mask"] = self.attention_mask() if self.requires_token_type_ids(): - dummy["token_type_ids"] = ( - self.token_type_ids() - .reshape(self.shapes["batch_size"], 1, self.shapes["sequence_length"]) - .repeat(1, self.shapes["num_choices"], 1) - ) + dummy["token_type_ids"] = self.token_type_ids() if self.with_labels: dummy["label"] = self.labels() diff --git a/tests/configs/_image_text_to_text_.yaml b/tests/configs/_image_text_to_text_.yaml index aa8357f75..20043a674 100644 --- a/tests/configs/_image_text_to_text_.yaml +++ b/tests/configs/_image_text_to_text_.yaml @@ -3,5 +3,8 @@ hydra: sweeper: params: backend.task: image-text-to-text - backend.model: hf-internal-testing/tiny-random-BlipForConditionalGeneration,hf-internal-testing/tiny-random-Blip2ForConditionalGeneration,hf-internal-testing/tiny-random-IdeficsForVisionText2Text + backend.model: hf-internal-testing/tiny-random-GitForCausalLM, + hf-internal-testing/tiny-random-BlipForConditionalGeneration, + hf-internal-testing/tiny-random-Blip2ForConditionalGeneration, + hf-internal-testing/tiny-random-IdeficsForVisionText2Text +scenario.input_shapes.num_images: 2 diff --git a/tests/test_api.py b/tests/test_api.py index 66ee16f95..fd6e2dac1 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -47,6 +47,9 @@ def test_api_launch(device, scenario, library, task, model): benchmark_name = f"{device}_{scenario}_{library}_{task}_{model}" + if task == "multiple-choice": + INPUT_SHAPES["num_choices"] = 2 + if device == "cuda": device_isolation = True if is_rocm_system(): @@ -82,7 +85,7 @@ def test_api_launch(device, scenario, library, task, model): duration=1, iterations=1, warmup_runs=1, - input_shapes={"batch_size": 1, "sequence_length": 2}, + input_shapes=INPUT_SHAPES, generate_kwargs={"max_new_tokens": 2, "min_new_tokens": 2}, call_kwargs={"num_inference_steps": 2}, ) @@ -170,7 +173,14 @@ def test_api_input_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") - input_generator = InputGenerator(task=task, input_shapes=INPUT_SHAPES, model_shapes=model_shapes) + if task == "multiple-choice": + INPUT_SHAPES["num_choices"] = 2 + + input_generator = InputGenerator( + task=task, + input_shapes=INPUT_SHAPES, + model_shapes=model_shapes, + ) generated_inputs = input_generator() assert len(generated_inputs) > 0, "No inputs were generated" @@ -193,6 +203,9 @@ def test_api_dataset_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") + if task == "multiple-choice": + 
DATASET_SHAPES["num_choices"] = 2 + generator = DatasetGenerator(task=task, dataset_shapes=DATASET_SHAPES, model_shapes=model_shapes) generated_dataset = generator()
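
As a quick smoke test of the model_type dispatch introduced in this series, the following is a minimal sketch of driving InputGenerator directly. The concrete shape values below are illustrative placeholders, not values extracted from any real checkpoint or processor:

    from optimum_benchmark.generators.input_generator import InputGenerator

    input_generator = InputGenerator(
        task="image-text-to-text",
        input_shapes={"batch_size": 1, "sequence_length": 256},
        model_shapes={
            # these would normally be extracted from the model config/processor artifacts
            "num_images": 1,
            "num_channels": 3,
            "height": 224,
            "width": 224,
            "patch_size": 14,
            "temporal_patch_size": 2,
            "spatial_merge_size": 2,
            "image_token_id": 151655,  # placeholder image token id
            "vocab_size": 152064,  # placeholder vocab size
        },
        model_type="qwen2_vl",  # dispatches to Qwen2VLGenerator instead of the task-level generator
    )

    dummy_inputs = input_generator()  # dict with input_ids, pixel_values and image_grid_thw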