47
47
from vllm .attention .layer import check_upstream_fa_availability
48
48
from vllm .compilation .decorators import support_torch_compile
49
49
from vllm .config import VllmConfig
50
- from vllm .config .multimodal import BaseDummyOptions
50
+ from vllm .config .multimodal import BaseDummyOptions , VideoDummyOptions
51
51
from vllm .distributed import get_pp_group
52
52
from vllm .logger import init_logger
53
53
from vllm .model_executor .layers .activation import _ACTIVATION_REGISTRY
@@ -741,20 +741,57 @@ def get_dummy_mm_data(
741
741
) -> MultiModalDataDict :
742
742
num_images = mm_counts .get ("image" , 0 )
743
743
num_videos = mm_counts .get ("video" , 0 )
744
+ image_overrides = mm_options .get ("image" ) if mm_options else None
745
+ video_overrides = mm_options .get ("video" ) if mm_options else None
744
746
745
747
target_width , target_height = (
746
748
self .info .get_image_size_with_most_features ())
747
749
target_num_frames = self .info .get_num_frames_with_most_features (
748
750
seq_len , mm_counts )
751
+
752
+ if video_overrides :
753
+ assert isinstance (video_overrides , VideoDummyOptions )
754
+ num_frames_override = video_overrides .num_frames
755
+ if num_frames_override :
756
+ if num_frames_override > target_num_frames :
757
+ logger .warning (
758
+ "video.num_frames override (%d) exceeds model's "
759
+ "maximum number of frames (%d), will be ignored" ,
760
+ num_frames_override , target_num_frames )
761
+ if num_frames_override < 2 :
762
+ logger .warning (
763
+ "video.num_frames override (%d) cannot be less "
764
+ "than 2, will be ignored" , num_frames_override )
765
+ target_num_frames = min (target_num_frames , num_frames_override )
766
+ target_num_frames = max (target_num_frames , 2 )
767
+
749
768
target_video_size , _ = self .info ._get_vision_info (
750
769
image_width = target_width ,
751
770
image_height = target_height ,
752
771
num_frames = target_num_frames ,
753
772
image_processor = self .info .get_video_processor (),
754
773
)
755
-
756
- image_overrides = mm_options .get ("image" ) if mm_options else None
757
- video_overrides = mm_options .get ("video" ) if mm_options else None
774
+ # NOTE: the video width/height overrides are checked here, after the target
775
+ # video size is computed, since Qwen3-VL resizes video frames depending on how many frames there are.
776
+ width , height = target_video_size .width , target_video_size .height
777
+ if video_overrides :
778
+ assert isinstance (video_overrides , VideoDummyOptions )
779
+ width_override = video_overrides .width
780
+ if width_override :
781
+ if width_override > width :
782
+ logger .warning (
783
+ "video.width override (%d) exceeds model's "
784
+ "maximum width (%d), will be ignored" , width_override ,
785
+ width )
786
+ width = min (width , width_override )
787
+ height_override = video_overrides .height
788
+ if height_override :
789
+ if height_override > height :
790
+ logger .warning (
791
+ "video.height override (%d) exceeds model's "
792
+ "maximum height (%d), will be ignored" ,
793
+ height_override , height )
794
+ height = min (height , height_override )
758
795
759
796
return {
760
797
"image" :
@@ -764,11 +801,10 @@ def get_dummy_mm_data(
764
801
overrides = image_overrides ),
765
802
"video" :
766
803
self ._get_dummy_videos (
767
- width = target_video_size . width ,
768
- height = target_video_size . height ,
804
+ width = width ,
805
+ height = height ,
769
806
num_frames = target_num_frames ,
770
807
num_videos = num_videos ,
771
- overrides = video_overrides ,
772
808
),
773
809
}
774
810
@@ -780,7 +816,6 @@ def _get_dummy_videos(
780
816
num_frames : int ,
781
817
num_videos : int ,
782
818
) -> list [VideoItem ]:
783
- num_frames = max (num_frames , 2 )
784
819
video = np .full ((num_frames , width , height , 3 ), 255 , dtype = np .uint8 )
785
820
video_items = []
786
821
for i in range (num_videos ):
@@ -796,18 +831,6 @@ def _get_dummy_videos(
796
831
video_items .append (video_item )
797
832
return video_items
798
833
799
- def get_dummy_processor_inputs (self , seq_len , mm_counts ):
800
- processor_inputs = super ().get_dummy_processor_inputs (
801
- seq_len , mm_counts )
802
- # HACK(Isotr0py): We set do_resize to False here to reuse Qwen2-VL's
803
- # profiling logic, which will be problematic for configurable mm
804
- # profiling.
805
- # TODO(Isotr0py): Switch to the implementation in
806
- # https://github.com/vllm-project/vllm/pull/25557
807
- # after supporting configurable mm profiling.
808
- processor_inputs .hf_processor_mm_kwargs = {"do_resize" : False }
809
- return processor_inputs
810
-
811
834
812
835
class Qwen3VLMultiModalProcessor (BaseMultiModalProcessor [Qwen3VLProcessingInfo ]
813
836
):
0 commit comments