Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions docs/contributing/model/multimodal.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,21 +293,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

target_width, target_height = \
self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
"image": self._get_dummy_images(
width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides,
)
}
```

Expand Down Expand Up @@ -479,17 +480,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image":
self._get_dummy_images(
"image": self._get_dummy_images(
width=target_width,
height=target_height,
num_images=num_images,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def test_dummy_data_generation(mock_ctx):
builder = AudioFlamingo3DummyInputsBuilder(info)

mm_counts = {"audio": 2}
dummy_data = builder.get_dummy_mm_data(100, mm_counts, None)
dummy_data = builder.get_dummy_mm_data(100, mm_counts, {})

assert "audio" in dummy_data
assert len(dummy_data["audio"]) == 2
Expand Down
56 changes: 36 additions & 20 deletions vllm/config/multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Mapping
from typing import Any, Literal, TypeAlias
from typing import Any, Literal, TypeAlias, TypedDict, final

from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
Expand Down Expand Up @@ -43,11 +43,29 @@ class AudioDummyOptions(BaseDummyOptions):
length: int | None = Field(None, gt=0)


@final
class MultiModalDummyOptionsBuiltins(TypedDict, total=False):
"""Type annotations for modality types predefined by vLLM."""

image: ImageDummyOptions
"""Options for dummy images."""

video: VideoDummyOptions
"""Options for dummy videos."""

audio: AudioDummyOptions
"""Options for dummy audios."""


MMEncoderTPMode = Literal["weights", "data"]
MMCacheType = Literal["shm", "lru"]
DummyOptions: TypeAlias = (
BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
)
MMDummyOptions: TypeAlias = dict[str, BaseDummyOptions]
"""
A dictionary containing an entry for each modality type of dummy data.
The built-in modalities are defined by
[`MultiModalDummyOptionsBuiltins`][vllm.config.multimodal.MultiModalDummyOptionsBuiltins].
"""


@config
Expand All @@ -57,7 +75,7 @@ class MultiModalConfig:
language_model_only: bool = False
"""If True, disables all multimodal inputs by setting all modality limits to 0.
Equivalent to setting `--limit-mm-per-prompt` to 0 for every modality."""
limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
limit_per_prompt: MMDummyOptions = Field(default_factory=dict)
"""The maximum number of input items and options allowed per
prompt for each modality.
Expand Down Expand Up @@ -158,22 +176,27 @@ class MultiModalConfig:
@field_validator("limit_per_prompt", mode="before")
@classmethod
def _validate_limit_per_prompt(
cls, value: dict[str, int | dict[str, int]]
) -> dict[str, DummyOptions]:
cls,
value: dict[str, int | dict[str, int]],
) -> MMDummyOptions:
out: MMDummyOptions = {}

for k, v in value.items():
# Handle legacy format where only count is specified
if isinstance(v, int):
v = {"count": v}

# Convert to the appropriate DummyOptions subclass
if k == "video":
value[k] = VideoDummyOptions(**v)
out[k] = VideoDummyOptions(**v)
elif k == "image":
value[k] = ImageDummyOptions(**v)
out[k] = ImageDummyOptions(**v)
elif k == "audio":
value[k] = AudioDummyOptions(**v)
out[k] = AudioDummyOptions(**v)
else:
value[k] = BaseDummyOptions(**v)
return value
out[k] = BaseDummyOptions(**v)

return out

@field_validator("mm_encoder_attn_backend", mode="before")
@classmethod
Expand Down Expand Up @@ -240,15 +263,8 @@ def get_limit_per_prompt(self, modality: str) -> int:
if limit_data is None:
# Unspecified modality is set to 999 by default
return 999
return limit_data.count

def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
"""
Get the configurable dummy data options for a modality.
Returns None if no options are configured for this modality.
"""
# All values are now DummyOptions after normalization
return self.limit_per_prompt.get(modality)
return limit_data.count

def merge_mm_processor_kwargs(
self,
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,15 +444,14 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
vision_config = self.info.get_vision_config()

max_image_size = vision_config.image_size
num_images = mm_counts.get("image", 0)

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
9 changes: 3 additions & 6 deletions vllm/model_executor/models/audioflamingo3.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,16 +252,13 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
feature_extractor = self.info.get_feature_extractor(
**(mm_processor_kwargs or {})
)
feature_extractor = self.info.get_feature_extractor()
sampling_rate = feature_extractor.sampling_rate
audio_len = MAX_AUDIO_LEN * sampling_rate
num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None
audio_overrides = mm_options.get("audio")

return {
"audio": self._get_dummy_audios(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/aya_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,13 +191,12 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
image_size = self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/bagel.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,15 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
hf_config = self.info.get_hf_config()
vit_config = hf_config.vit_config

# Use the configured image size
image_size = vit_config.image_size
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/bee.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,13 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

target_width, target_height = self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/blip2.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,16 +445,15 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
hf_config = self.info.get_hf_config()
vision_config = hf_config.vision_config

max_image_size = vision_config.image_size
num_images = mm_counts.get("image", 0)

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/chameleon.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,14 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
config = self.info.get_hf_config()

width = height = config.vq_config.resolution
num_images = mm_counts.get("image", 0)

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,13 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

target_width, target_height = self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/cohere2_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,13 +197,12 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
image_size = self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
3 changes: 1 addition & 2 deletions vllm/model_executor/models/deepseek_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,7 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

Expand Down
3 changes: 1 addition & 2 deletions vllm/model_executor/models/deepseek_ocr2.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/deepseek_vl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,13 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

max_image_size = self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
10 changes: 3 additions & 7 deletions vllm/model_executor/models/dots_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,13 @@ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

mm_processor_kwargs = mm_processor_kwargs or {}
target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501
mm_processor_kwargs.get("max_pixels", None)
)
target_width, target_height = self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")

return {
"image": self._get_dummy_images(
Expand Down
2 changes: 1 addition & 1 deletion vllm/model_executor/models/eagle2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model."""

def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor:
return self.ctx.init_processor(
return self.ctx.get_hf_processor(
Eagle2_5_VLProcessor,
config=self.ctx.get_hf_config(),
tokenizer=self.get_tokenizer(),
Expand Down
Loading