44import hashlib
55from collections .abc import Mapping
66from dataclasses import field
7- from typing import Any , Literal , Optional
7+ from typing import Any , Literal , Optional , Union
88
9+ from pydantic import ConfigDict , Field , field_validator
910from pydantic .dataclasses import dataclass
1011
11- import vllm .envs as envs
1212from vllm .config .utils import config
1313
14+
@dataclass
class BaseDummyOptions:
    """Common settings for dummy data generated during profiling.

    Attributes are documented inline; modality-specific option classes
    extend this one.
    """

    # Number of dummy items; validated to be non-negative, defaults to 999.
    count: int = Field(default=999, ge=0)
19+
20+
@dataclass(config=ConfigDict(extra="forbid"))
class VideoDummyOptions(BaseDummyOptions):
    """Dummy video settings used during profiling.

    Extends the inherited ``count`` with optional video dimensions. Every
    optional value must be strictly positive when provided, and the
    ``extra="forbid"`` config rejects unknown keys.
    """

    # Frames per dummy video; None leaves it unspecified.
    num_frames: Optional[int] = Field(default=None, gt=0)
    # Frame width; None leaves it unspecified.
    width: Optional[int] = Field(default=None, gt=0)
    # Frame height; None leaves it unspecified.
    height: Optional[int] = Field(default=None, gt=0)
27+
28+
@dataclass(config=ConfigDict(extra="forbid"))
class ImageDummyOptions(BaseDummyOptions):
    """Dummy image settings used during profiling.

    Extends the inherited ``count`` with optional image dimensions. Each
    dimension must be strictly positive when provided, and the
    ``extra="forbid"`` config rejects unknown keys.
    """

    # Image width; None leaves it unspecified.
    width: Optional[int] = Field(default=None, gt=0)
    # Image height; None leaves it unspecified.
    height: Optional[int] = Field(default=None, gt=0)
34+
35+
@dataclass(config=ConfigDict(extra="forbid"))
class AudioDummyOptions(BaseDummyOptions):
    """Dummy audio settings used during profiling.

    Extends the inherited ``count`` with an optional length, which must be
    strictly positive when provided. The ``extra="forbid"`` config rejects
    unknown keys.
    """

    # Length of the dummy audio; None leaves it unspecified.
    # NOTE(review): the unit (samples vs. seconds) is not visible here.
    length: Optional[int] = Field(default=None, gt=0)
40+
41+
# String literals accepted for the multimodal encoder TP mode setting.
MMEncoderTPMode = Literal["weights", "data"]
# String literals accepted for the multimodal cache type setting.
MMCacheType = Literal["shm", "lru"]
# Any of the per-modality dummy option classes (or the shared base).
DummyOptions = Union[
    BaseDummyOptions,
    VideoDummyOptions,
    ImageDummyOptions,
    AudioDummyOptions,
]
1646
1747
1848@config
1949@dataclass
2050class MultiModalConfig :
2151 """Controls the behavior of multimodal models."""
2252
23- limit_per_prompt : dict [str , int ] = field (default_factory = dict )
24- """The maximum number of input items allowed per prompt for each modality.
25- Defaults to 1 (V0) or 999 (V1) for each modality.
53+ limit_per_prompt : dict [str , DummyOptions ] = field (default_factory = dict )
54+ """The maximum number of input items and options allowed per
55+ prompt for each modality.
56+ Defaults to 999 for each modality.
57+
58+ Legacy format (count only):
59+ {"image": 16, "video": 2}
60+
61+ Configurable format (with options):
62+ {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
63+ "image": {"count": 5, "width": 512, "height": 512}}
2664
27- For example, to allow up to 16 images and 2 videos per prompt:
28- `{"image": 16, "video": 2}`"""
65+ Mixed format (combining both):
66+ {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
67+ "height": 512}}
68+ """
2969 media_io_kwargs : dict [str , dict [str , Any ]] = field (default_factory = dict )
3070 """Additional args passed to process media inputs, keyed by modalities.
3171 For example, to set num_frames for video, set
@@ -84,6 +124,27 @@ class MultiModalConfig:
84124 from each video to be pruned.
85125 """
86126
@field_validator("limit_per_prompt", mode="before")
@classmethod
def _validate_limit_per_prompt(
        cls, value: dict[str, Union[int, dict[str, int]]]
) -> dict[str, DummyOptions]:
    """Normalize ``limit_per_prompt`` entries into dummy option objects.

    Each entry may be given as:

    - an ``int`` (legacy format), treated as ``{"count": <int>}``;
    - a mapping of option names to values;
    - an already-constructed ``BaseDummyOptions`` (or subclass) instance,
      which is kept unchanged.

    The ``"video"``, ``"image"`` and ``"audio"`` keys are converted to
    their dedicated option classes; any other key uses
    ``BaseDummyOptions``.

    Args:
        value: Raw mapping from modality name to its limit/options.

    Returns:
        A new dict mapping each modality to an options object. The input
        mapping is left unmodified.
    """
    options_by_modality = {
        "video": VideoDummyOptions,
        "image": ImageDummyOptions,
        "audio": AudioDummyOptions,
    }

    # Build a fresh dict so the caller's mapping is never rewritten.
    normalized: dict[str, DummyOptions] = {}
    for modality, raw in value.items():
        # Already-normalized entries cannot be unpacked with ``**``;
        # keep them as they are.
        if isinstance(raw, BaseDummyOptions):
            normalized[modality] = raw
            continue
        # Handle legacy format where only count is specified
        if isinstance(raw, int):
            raw = {"count": raw}
        # Convert to the appropriate DummyOptions subclass
        options_cls = options_by_modality.get(modality, BaseDummyOptions)
        normalized[modality] = options_cls(**raw)
    return normalized
147+
87148 def compute_hash (self ) -> str :
88149 """
89150 WARNING: Whenever a new field is added to this config,
@@ -106,12 +167,22 @@ def compute_hash(self) -> str:
def get_limit_per_prompt(self, modality: str) -> int:
    """
    Return how many input items of ``modality`` a prompt may contain.

    Kept for backward compatibility: only the ``count`` of the
    configured options is returned.
    """
    configured = self.limit_per_prompt.get(modality)
    # Unspecified modality is set to 999 by default
    return 999 if configured is None else configured.count
178+
def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
    """
    Look up the dummy data options configured for ``modality``.

    Returns None when ``limit_per_prompt`` has no entry for it.
    """
    # Entries are expected to be option objects after field validation.
    configured_options = self.limit_per_prompt
    return configured_options.get(modality)
115186
116187 def merge_mm_processor_kwargs (
117188 self ,
0 commit comments