42 changes: 42 additions & 0 deletions examples/pytorch_vlm.yaml
@@ -0,0 +1,42 @@
+defaults:
+  - benchmark
+  - scenario: inference
+  - launcher: process
+  - backend: pytorch
+  - _base_
+  - _self_
+
+name: pytorch_vlm
+
+launcher:
+  device_isolation: true
+  device_isolation_action: warn
+
+backend:
+  device: cuda
+  device_ids: 0
+  no_weights: true
+  torch_dtype: float16
+  model: Qwen/Qwen2-VL-7B-Instruct
+
+scenario:
+  memory: true
+  latency: true
+
+  warmup_runs: 10
+  iterations: 10
+  duration: 10
+
+  input_shapes:
+    # text
+    batch_size: 1
+    sequence_length: 256
+    # image
+    num_images: 1
+    num_channels: 3
+    height: 224
+    width: 224
+
+  generate_kwargs:
+    max_new_tokens: 32
+    min_new_tokens: 32
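For readers who prefer the programmatic route, the same benchmark can also be expressed through the library's Python API. This is a minimal sketch assuming the Benchmark.launch pattern from the optimum-benchmark README; the config classes mirror the YAML sections above and are not part of this PR:

# A sketch of the same benchmark via the Python API, assuming the
# Benchmark.launch pattern from the optimum-benchmark README; the
# config classes and fields mirror the YAML sections above.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig

if __name__ == "__main__":
    benchmark_config = BenchmarkConfig(
        name="pytorch_vlm",
        launcher=ProcessConfig(device_isolation=True, device_isolation_action="warn"),
        backend=PyTorchConfig(
            model="Qwen/Qwen2-VL-7B-Instruct",
            device="cuda",
            device_ids="0",
            no_weights=True,
            torch_dtype="float16",
        ),
        scenario=InferenceConfig(
            memory=True,
            latency=True,
            warmup_runs=10,
            iterations=10,
            duration=10,
            input_shapes={"batch_size": 1, "sequence_length": 256, "num_images": 1,
                          "num_channels": 3, "height": 224, "width": 224},
            generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32},
        ),
    )
    benchmark_report = Benchmark.launch(benchmark_config)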
33 changes: 15 additions & 18 deletions optimum_benchmark/backends/timm_utils.py
@@ -35,15 +35,17 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
     shapes = {}
 
     # image input
-    shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    if shapes["num_channels"] is None:
-        # processors have different names for the number of channels
-        shapes["num_channels"] = artifacts_dict.get("channels", None)
+    if "num_channels" in artifacts_dict:
+        shapes["num_channels"] = artifacts_dict.get("num_channels", None)
+    elif "channels" in artifacts_dict:
+        shapes["num_channels"] = artifacts_dict.get("channels", None)
 
-    image_size = artifacts_dict.get("image_size", None)
-    if image_size is None:
-        # processors have different names for the image size
-        image_size = artifacts_dict.get("size", None)
+    if "image_size" in artifacts_dict:
+        image_size = artifacts_dict["image_size"]
+    elif "size" in artifacts_dict:
+        image_size = artifacts_dict["size"]
+    else:
+        image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -57,24 +59,19 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]:
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
-    else:
-        shapes["height"] = None
-        shapes["width"] = None
 
-    input_size = artifacts_dict.get("input_size", None)
-    if input_size is not None:
+    if "input_size" in artifacts_dict:
+        input_size = artifacts_dict.get("input_size", None)
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    id2label = artifacts_dict.get("id2label", None)
-    if id2label is not None:
+    if "id2label" in artifacts_dict:
+        id2label = artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-
-    num_classes = artifacts_dict.get("num_classes", None)
-    if num_classes is not None:
-        shapes["num_labels"] = num_classes
+    elif "num_classes" in artifacts_dict:
+        shapes["num_labels"] = artifacts_dict["num_classes"]
 
     return shapes

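The practical effect of switching from .get(..., None) to key-presence checks, here and in transformers_utils.py below, is that shape entries are only written when the artifact actually provides them, instead of being padded with None. A small self-contained illustration (the dict is made up for the example):

# Made-up artifacts dict: no channel info provided.
artifacts_dict = {"num_classes": 1000}

# Old pattern: the key is always written, absent values become None.
shapes_old = {}
shapes_old["num_channels"] = artifacts_dict.get("num_channels", None)

# New pattern: absent keys are skipped entirely.
shapes_new = {}
if "num_channels" in artifacts_dict:
    shapes_new["num_channels"] = artifacts_dict["num_channels"]

print(shapes_old)  # {'num_channels': None}
print(shapes_new)  # {}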
66 changes: 40 additions & 26 deletions optimum_benchmark/backends/transformers_utils.py
@@ -47,6 +47,7 @@
     "image-to-text": "AutoModelForVision2Seq",
     "text-generation": "AutoModelForCausalLM",
     "text2text-generation": "AutoModelForSeq2SeqLM",
+    "image-text-to-text": "AutoModelForImageTextToText",
     "visual-question-answering": "AutoModelForVisualQuestionAnswering",
     "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"),
 }
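The new entry wires the image-text-to-text task to transformers' AutoModelForImageTextToText auto class. As a quick sketch of what that mapping resolves to (the model id is just the one from the example config above; the backend's actual loading path differs):

# Sketch: the task string resolves to this transformers auto class.
import torch
from transformers import AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",  # example model from the YAML above
    torch_dtype=torch.float16,
)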
@@ -125,22 +126,27 @@ def extract_transformers_shapes_from_artifacts(
     shapes = {}
 
     # text input
-    shapes["vocab_size"] = artifacts_dict.get("vocab_size", None)
-    shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", None)
-    shapes["max_position_embeddings"] = artifacts_dict.get("max_position_embeddings", None)
-    if shapes["max_position_embeddings"] is None:
-        shapes["max_position_embeddings"] = artifacts_dict.get("n_positions", None)
+    if "vocab_size" in artifacts_dict:
+        shapes["vocab_size"] = artifacts_dict["vocab_size"]
+
+    if "type_vocab_size" in artifacts_dict:
+        shapes["type_vocab_size"] = artifacts_dict["type_vocab_size"]
+
+    if "max_position_embeddings" in artifacts_dict:
+        shapes["max_position_embeddings"] = artifacts_dict["max_position_embeddings"]
+    elif "n_positions" in artifacts_dict:
+        shapes["max_position_embeddings"] = artifacts_dict["n_positions"]
 
     # image input
-    shapes["num_channels"] = artifacts_dict.get("num_channels", None)
-    if shapes["num_channels"] is None:
-        # processors have different names for the number of channels
+    if "num_channels" in artifacts_dict:
         shapes["num_channels"] = artifacts_dict.get("channels", None)
 
-    image_size = artifacts_dict.get("image_size", None)
-    if image_size is None:
-        # processors have different names for the image size
-        image_size = artifacts_dict.get("size", None)
+    if "image_size" in artifacts_dict:
+        image_size = artifacts_dict["image_size"]
+    elif "size" in artifacts_dict:
+        image_size = artifacts_dict["size"]
+    else:
+        image_size = None
 
     if isinstance(image_size, (int, float)):
         shapes["height"] = image_size
@@ -154,29 +160,37 @@ def extract_transformers_shapes_from_artifacts(
     elif isinstance(image_size, dict) and len(image_size) == 1:
         shapes["height"] = list(image_size.values())[0]
         shapes["width"] = list(image_size.values())[0]
-    else:
-        shapes["height"] = None
-        shapes["width"] = None
 
-    input_size = artifacts_dict.get("input_size", None)
-    if input_size is not None:
+    if "input_size" in artifacts_dict:
+        input_size = artifacts_dict["input_size"]
         shapes["num_channels"] = input_size[0]
         shapes["height"] = input_size[1]
         shapes["width"] = input_size[2]
 
     # classification labels
-    id2label = artifacts_dict.get("id2label", None)
-    if id2label is not None:
+    if "id2label" in artifacts_dict:
+        id2label = artifacts_dict["id2label"]
         shapes["num_labels"] = len(id2label)
-
-    num_classes = artifacts_dict.get("num_classes", None)
-    if num_classes is not None:
-        shapes["num_labels"] = num_classes
+    elif "num_classes" in artifacts_dict:
+        shapes["num_labels"] = artifacts_dict["num_classes"]
 
     # object detection labels
-    shapes["num_queries"] = artifacts_dict.get("num_queries", None)
-    if shapes["num_queries"] == 0:
-        shapes["num_queries"] = 2
+    if "num_queries" in artifacts_dict:
+        shapes["num_queries"] = artifacts_dict["num_queries"]
 
+    # image-text input
+    if "image_token_id" in artifacts_dict:
+        shapes["image_token_id"] = artifacts_dict["image_token_id"]
+
+    if "vision_config" in artifacts_dict:
+        if "in_chans" in artifacts_dict["vision_config"]:
+            shapes["num_channels"] = artifacts_dict["vision_config"]["in_chans"]
+        if "patch_size" in artifacts_dict["vision_config"]:
+            shapes["patch_size"] = artifacts_dict["vision_config"]["patch_size"]
+        if "temporal_patch_size" in artifacts_dict["vision_config"]:
+            shapes["temporal_patch_size"] = artifacts_dict["vision_config"]["temporal_patch_size"]
+        if "spatial_merge_size" in artifacts_dict["vision_config"]:
+            shapes["spatial_merge_size"] = artifacts_dict["vision_config"]["spatial_merge_size"]
+
     return shapes

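To make the new image-text branch concrete, here is a standalone illustration of what it extracts from a Qwen2-VL-style config. The dict below is hand-written for the example; only its key names mirror what the function reads:

# Hand-written, Qwen2-VL-style artifacts dict; values are illustrative.
artifacts_dict = {
    "image_token_id": 151655,
    "vision_config": {
        "in_chans": 3,
        "patch_size": 14,
        "temporal_patch_size": 2,
        "spatial_merge_size": 2,
    },
}

shapes = {}

# Mirrors the new image-text branch above.
if "image_token_id" in artifacts_dict:
    shapes["image_token_id"] = artifacts_dict["image_token_id"]

if "vision_config" in artifacts_dict:
    vision_config = artifacts_dict["vision_config"]
    if "in_chans" in vision_config:
        shapes["num_channels"] = vision_config["in_chans"]
    if "patch_size" in vision_config:
        shapes["patch_size"] = vision_config["patch_size"]
    if "temporal_patch_size" in vision_config:
        shapes["temporal_patch_size"] = vision_config["temporal_patch_size"]
    if "spatial_merge_size" in vision_config:
        shapes["spatial_merge_size"] = vision_config["spatial_merge_size"]

print(shapes)
# {'image_token_id': 151655, 'num_channels': 3, 'patch_size': 14,
#  'temporal_patch_size': 2, 'spatial_merge_size': 2}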