5 changes: 4 additions & 1 deletion docs/models/supported_models.md
@@ -562,7 +562,10 @@ Specified using `--task generate`.
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ |
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
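As a quick way to exercise the newly listed Qwen3-VL entries, the offline `LLM` API can be pointed at one of the checkpoints from the table. A minimal sketch, assuming the `Qwen/Qwen3-VL-4B-Instruct` checkpoint and vLLM's bundled `ImageAsset` helper (both are assumptions about your local setup, not part of this diff):

```python
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Small context window and a single image per prompt, mirroring the example script below.
llm = LLM(
    model="Qwen/Qwen3-VL-4B-Instruct",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

# Chat-style prompt with the Qwen vision placeholder tokens.
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "What is in this image?<|im_end|>\n"
    "<|im_start|>assistant\n"
)

image = ImageAsset("cherry_blossom").pil_image
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```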
122 changes: 121 additions & 1 deletion examples/offline_inference/vision_language.py
@@ -1076,6 +1076,116 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)


# Qwen3-VL-Dense
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-4B-Instruct"

engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
)

if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"

prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]

return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)


# Qwen3-VL-MOE
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"

engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
)

if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"

prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]

return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)


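# Qwen3-Omni-MOE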
def run_qwen3_omni_moe(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
)

if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"

prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]

return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1146,11 +1256,20 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
"qwen2_5_omni": run_qwen2_5_omni,
"qwen3_vl": run_qwen3_vl,
"qwen3_vl_moe": run_qwen3_vl_moe,
"qwen3_omni_moe": run_qwen3_omni_moe,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
"tarsier": run_tarsier,
}

MODELS_NEED_VIDEO_METADATA = [
"glm4_1v",
"qwen3_vl",
"qwen3_vl_moe",
]


def get_multi_modal_input(args):
"""
@@ -1176,12 +1295,13 @@ def get_multi_modal_input(args):

if args.modality == "video":
# Input video and question
needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
vid_questions = ["Why is this video funny?"]

return {
"data": [(video, metadata)] if args.model_type == "glm4_1v" else video,
"data": [(video, metadata)] if needs_metadata else video,
"questions": vid_questions,
}

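For video inputs, models listed in `MODELS_NEED_VIDEO_METADATA` now receive a `(frames, metadata)` tuple instead of a bare frame array. A short sketch of what that tuple looks like, built with the `VideoAsset` helper extended later in this PR (the exact CLI flags for driving the example script, e.g. `--model-type qwen3_vl --modality video`, may differ):

```python
from vllm.assets.video import VideoAsset

asset = VideoAsset(name="baby_reading", num_frames=16)
frames = asset.np_ndarrays  # (num_frames, H, W, 3) uint8 frames
metadata = asset.metadata   # fps, duration, frames_indices, do_sample_frames, ...

# glm4_1v, qwen3_vl and qwen3_vl_moe take the tuple form; other models keep
# receiving the bare frame array.
video_input = (frames, metadata)
```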
34 changes: 33 additions & 1 deletion tests/models/multimodal/processing/test_common.py
@@ -30,6 +30,7 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
# Ensure video metadata is included
if "video" in mm_data:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
mm_data["video"] = (video, {
"total_num_frames": len(video),
@@ -39,6 +40,33 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
})
return mm_data

def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for Qwen3-VL models.
"""

def create_metadata(frames: np.ndarray):
num_frames = len(frames)
return {
"total_num_frames": num_frames,
"fps": 2.0,
"duration": num_frames / 2.0,
"video_backend": "opencv",
"frames_indices": list(range(num_frames)),
"do_sample_frames": True,
}

# Ensure video metadata is included
if "video" in mm_data:
video = mm_data["video"]
if isinstance(video, list):
# multiple videos
mm_data["video"] = [(vid, create_metadata(vid)) for vid in video]
else:
# single video
mm_data["video"] = (video, create_metadata(video))
return mm_data


def _test_processing_correctness(
model_id: str,
@@ -171,8 +199,10 @@ def _test_processing_correctness(
}

MM_DATA_PATCHES = {
# GLM4.1V requires video metadata to be included in the input
# GLM4.1V and Qwen3-VL require video metadata to be included in the input
"glm4v": glm4_1v_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
}


@@ -304,6 +334,8 @@ def _test_processing_correctness_one(
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2.5-Omni-3B",
"Qwen/Qwen3-VL-4B-Instruct",
"Qwen/Qwen3-VL-30B-A3B-Instruct",
"Skywork/Skywork-R1V-38B",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3",
9 changes: 9 additions & 0 deletions tests/models/registry.py
@@ -404,6 +404,15 @@ def check_available_online(
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501
"Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
"Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501
"Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct", # noqa: E501
max_model_len=4096,
min_transformers_version="4.57"), # noqa: E501
"Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", # noqa: E501
max_model_len=4096,
min_transformers_version="4.57"),
"Qwen3OmniMoeThinkerForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-Omni-30B-A3B-Instruct", # noqa: E501
max_model_len=4096,
min_transformers_version="4.57"),
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
"SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
13 changes: 10 additions & 3 deletions vllm/assets/video.py
@@ -77,7 +77,7 @@ def video_to_pil_images_list(path: str,
]


def video_get_metadata(path: str) -> dict[str, Any]:
def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
cap = cv2.VideoCapture(path)
if not cap.isOpened():
raise ValueError(f"Could not open video file {path}")
@@ -86,11 +86,18 @@ def video_get_metadata(path: str) -> dict[str, Any]:
fps = cap.get(cv2.CAP_PROP_FPS)
duration = total_frames / fps if fps > 0 else 0

if num_frames == -1 or num_frames > total_frames:
num_frames = total_frames

metadata = {
"total_num_frames": total_frames,
"fps": fps,
"duration": duration,
"video_backend": "opencv"
"video_backend": "opencv",
"frames_indices": list(range(num_frames)),
# extra field used to control hf processor's video
# sampling behavior
"do_sample_frames": num_frames == total_frames,
}
return metadata

@@ -126,7 +133,7 @@ def np_ndarrays(self) -> npt.NDArray:
@property
def metadata(self) -> dict[str, Any]:
video_path = download_video_asset(self.filename)
ret = video_get_metadata(video_path)
ret = video_get_metadata(video_path, self.num_frames)
return ret

def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
8 changes: 8 additions & 0 deletions vllm/attention/layer.py
@@ -24,6 +24,14 @@
from vllm.v1.attention.backends.utils import validate_kv_sharing_target


def check_upstream_fa_availability(dtype: torch.dtype):
if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda(
) and current_platform.has_device_capability(80):
from transformers.utils import is_flash_attn_2_available
return is_flash_attn_2_available()
return False


class Attention(nn.Module):
"""Attention layer.
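The new `check_upstream_fa_availability` helper reports whether upstream FlashAttention can be used for a given dtype on the current device. A rough sketch of a call site; the backend names are illustrative, not the wiring this PR actually adds:

```python
import torch

from vllm.attention.layer import check_upstream_fa_availability

# Prefer upstream flash-attn for fp16/bf16 on CUDA devices with compute
# capability >= 8.0, otherwise fall back to a generic implementation.
attn_backend = (
    "flash_attention_2"  # illustrative backend name
    if check_upstream_fa_availability(torch.bfloat16)
    else "sdpa"          # illustrative fallback
)
```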
6 changes: 6 additions & 0 deletions vllm/entrypoints/chat_utils.py
@@ -537,6 +537,8 @@ def _placeholder_str(self, modality: ModalityStr,
return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "qwen2_5_omni":
return "<|vision_start|><|IMAGE|><|vision_end|>"
if model_type in ("qwen3_omni_moe", "qwen3_vl_moe", "qwen3_vl"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "molmo":
return ""
if model_type == "aria":
@@ -555,6 +557,8 @@
if model_type in ("qwen2_audio", "qwen2_5_omni"):
return (f"Audio {current_count}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
if model_type == "qwen3_omni_moe":
return "<|audio_start|><|audio_pad|><|audio_end|>"
if model_type == "minicpmo":
return "(<audio>./</audio>)"
raise TypeError(f"Unknown model type: {model_type}")
@@ -567,6 +571,8 @@
return "<|vision_start|><|video_pad|><|vision_end|>"
if model_type == "qwen2_5_omni":
return "<|vision_start|><|VIDEO|><|vision_end|>"
if model_type in ("qwen3_omni_moe", "qwen3_vl_moe", "qwen3_vl"):
return "<|vision_start|><|video_pad|><|vision_end|>"
if model_type in ("minicpmo", "minicpmv"):
return "(<video>./</video>)"
if model_type.startswith("llava"):
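These placeholder strings are what `_placeholder_str` injects when a chat request references an image or video, so API clients never write them by hand. A rough sketch of the client side against a vLLM OpenAI-compatible server (the server URL, image URL, and served model name are assumptions):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="Qwen/Qwen3-VL-4B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
)
print(resp.choices[0].message.content)
```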