Skip to content

Commit 67bc0c0

Browse files
authored
[Bugfix] Fix qwen3 vl dummy data generation with overrides (vllm-project#26193)
Signed-off-by: Roger Wang <[email protected]>
1 parent (5a05f26); commit 67bc0c0

File tree

1 file changed

+43
-20
lines changed

1 file changed

+43
-20
lines changed

vllm/model_executor/models/qwen3_vl.py

Lines changed: 43 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@
4747
from vllm.attention.layer import check_upstream_fa_availability
4848
from vllm.compilation.decorators import support_torch_compile
4949
from vllm.config import VllmConfig
50-
from vllm.config.multimodal import BaseDummyOptions
50+
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
5151
from vllm.distributed import get_pp_group
5252
from vllm.logger import init_logger
5353
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
@@ -741,20 +741,57 @@ def get_dummy_mm_data(
741741
) -> MultiModalDataDict:
742742
num_images = mm_counts.get("image", 0)
743743
num_videos = mm_counts.get("video", 0)
744+
image_overrides = mm_options.get("image") if mm_options else None
745+
video_overrides = mm_options.get("video") if mm_options else None
744746

745747
target_width, target_height = (
746748
self.info.get_image_size_with_most_features())
747749
target_num_frames = self.info.get_num_frames_with_most_features(
748750
seq_len, mm_counts)
751+
752+
if video_overrides:
753+
assert isinstance(video_overrides, VideoDummyOptions)
754+
num_frames_override = video_overrides.num_frames
755+
if num_frames_override:
756+
if num_frames_override > target_num_frames:
757+
logger.warning(
758+
"video.num_frames override (%d) exceeds model's "
759+
"maximum number of frames (%d), will be ignored",
760+
num_frames_override, target_num_frames)
761+
if num_frames_override < 2:
762+
logger.warning(
763+
"video.num_frames override (%d) cannot be less "
764+
"than 2, will be ignored", num_frames_override)
765+
target_num_frames = min(target_num_frames, num_frames_override)
766+
target_num_frames = max(target_num_frames, 2)
767+
749768
target_video_size, _ = self.info._get_vision_info(
750769
image_width=target_width,
751770
image_height=target_height,
752771
num_frames=target_num_frames,
753772
image_processor=self.info.get_video_processor(),
754773
)
755-
756-
image_overrides = mm_options.get("image") if mm_options else None
757-
video_overrides = mm_options.get("video") if mm_options else None
774+
# NOTE: we need to do this check here since Qwen3-VL resizes video
775+
# frames depending on how many frames there are.
776+
width, height = target_video_size.width, target_video_size.height
777+
if video_overrides:
778+
assert isinstance(video_overrides, VideoDummyOptions)
779+
width_override = video_overrides.width
780+
if width_override:
781+
if width_override > width:
782+
logger.warning(
783+
"video.width override (%d) exceeds model's "
784+
"maximum width (%d), will be ignored", width_override,
785+
width)
786+
width = min(width, width_override)
787+
height_override = video_overrides.height
788+
if height_override:
789+
if height_override > height:
790+
logger.warning(
791+
"video.height override (%d) exceeds model's "
792+
"maximum height (%d), will be ignored",
793+
height_override, height)
794+
height = min(height, height_override)
758795

759796
return {
760797
"image":
@@ -764,11 +801,10 @@ def get_dummy_mm_data(
764801
overrides=image_overrides),
765802
"video":
766803
self._get_dummy_videos(
767-
width=target_video_size.width,
768-
height=target_video_size.height,
804+
width=width,
805+
height=height,
769806
num_frames=target_num_frames,
770807
num_videos=num_videos,
771-
overrides=video_overrides,
772808
),
773809
}
774810

@@ -780,7 +816,6 @@ def _get_dummy_videos(
780816
num_frames: int,
781817
num_videos: int,
782818
) -> list[VideoItem]:
783-
num_frames = max(num_frames, 2)
784819
video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
785820
video_items = []
786821
for i in range(num_videos):
@@ -796,18 +831,6 @@ def _get_dummy_videos(
796831
video_items.append(video_item)
797832
return video_items
798833

799-
def get_dummy_processor_inputs(self, seq_len, mm_counts):
800-
processor_inputs = super().get_dummy_processor_inputs(
801-
seq_len, mm_counts)
802-
# HACK(Isotr0py): We set do_resize to False here to reuse Qwen2-VL's
803-
# profiling logic, which will be problematic for configurable mm
804-
# profiling.
805-
# TODO(Isotr0py): Switch to the implementation in
806-
# https://github.com/vllm-project/vllm/pull/25557
807-
# after supporting configurable mm profiling.
808-
processor_inputs.hf_processor_mm_kwargs = {"do_resize": False}
809-
return processor_inputs
810-
811834

812835
class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
813836
):

0 commit comments

Comments
 (0)