
Commit 3b7fea7

Authored by fyabc, ywang96, and DarkLight1337
[Model][VLM] Add Qwen2-VL model support (vllm-project#7905)
Co-authored-by: Roger Wang <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
1 parent cea95df commit 3b7fea7

File tree

14 files changed: +1531 -31 lines

docs/source/models/supported_models.rst

Lines changed: 7 additions & 3 deletions
@@ -252,6 +252,11 @@ Multimodal Language Models
     - Image\ :sup:`E`
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     -
+  * - :code:`Qwen2VLForConditionalGeneration`
+    - Qwen2-VL (see note)
+    - Image\ :sup:`+` / Video\ :sup:`+`
+    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
+    -
   * - :code:`UltravoxModel`
     - Ultravox
     - Audio\ :sup:`E+`
@@ -265,15 +270,14 @@ Multimodal Language Models
   For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
   For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

-  For :code:`LLaVA-NeXT-Video`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+.. note::
+  For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
   This can be installed by running the following command:

-
   .. code-block:: bash

     pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
-
 ----

 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
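Since the updated note requires a development build of transformers, a quick version check before loading Qwen2-VL avoids a confusing import failure. A minimal sketch, reusing the same "4.45" threshold that tests/models/test_registry.py checks later in this commit:

import transformers

# Qwen2-VL support is expected in transformers 4.45; until that release ships,
# this commit pins the development revision 21fac7abba2a37fae86106f87fcf9974fd1e3830.
if transformers.__version__ < "4.45":
    raise RuntimeError(
        "Qwen2-VL needs a development build of transformers; see the install "
        "command in the note above.")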

examples/offline_inference_vision_language.py

Lines changed: 18 additions & 0 deletions
@@ -179,6 +179,23 @@ def run_qwen_vl(question):
     return llm, prompt, stop_token_ids
 
 
+# Qwen2-VL
+def run_qwen2_vl(question):
+    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=5,
+    )
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
     "llava": run_llava,
     "llava-next": run_llava_next,
@@ -191,6 +208,7 @@ def run_qwen_vl(question):
     "blip-2": run_blip2,
     "internvl_chat": run_internvl,
     "qwen_vl": run_qwen_vl,
+    "qwen2_vl": run_qwen2_vl,
 }
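For reference, a minimal sketch of how the script's driver consumes the returned triple; the image URL is a placeholder, and the generate call mirrors the multi-image example below:

from vllm import SamplingParams
from vllm.multimodal.utils import fetch_image

llm, prompt, stop_token_ids = run_qwen2_vl("What is in this image?")
image = fetch_image("https://example.com/cat.jpg")  # placeholder URL

sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=128,
                                 stop_token_ids=stop_token_ids)
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params)
print(outputs[0].outputs[0].text)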

examples/offline_inference_vision_language_multi_image.py

Lines changed: 61 additions & 7 deletions
@@ -6,7 +6,7 @@
 from argparse import Namespace
 from typing import List
 
-from transformers import AutoTokenizer
+from transformers import AutoProcessor, AutoTokenizer
 
 from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import fetch_image
@@ -30,7 +30,7 @@ def load_phi3v(question, image_urls: List[str]):
                          for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
     stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    return llm, prompt, stop_token_ids, None
 
 
 def load_internvl(question, image_urls: List[str]):
@@ -60,18 +60,72 @@ def load_internvl(question, image_urls: List[str]):
     # https://huggingface.co/OpenGVLab/InternVL2-2B#service
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompt, stop_token_ids
+
+    return llm, prompt, stop_token_ids, None
+
+
+def load_qwen2_vl(question, image_urls: List[str]):
+    try:
+        from qwen_vl_utils import process_vision_info
+    except ModuleNotFoundError:
+        print('WARNING: `qwen-vl-utils` not installed, input images will not '
+              'be automatically resized. You can enable this functionality by '
+              '`pip install qwen-vl-utils`.')
+        process_vision_info = None
+
+    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=5,
+        max_model_len=32768 if process_vision_info is None else 4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    stop_token_ids = None
+
+    if process_vision_info is None:
+        image_data = [fetch_image(url) for url in image_urls]
+    else:
+        image_data, _ = process_vision_info(messages)
+
+    return llm, prompt, stop_token_ids, image_data
 
 
 model_example_map = {
     "phi3_v": load_phi3v,
     "internvl_chat": load_internvl,
+    "qwen2_vl": load_qwen2_vl,
 }
 
 
 def run_generate(model, question: str, image_urls: List[str]):
-    llm, prompt, stop_token_ids = model_example_map[model](question,
-                                                           image_urls)
+    llm, prompt, stop_token_ids, image_data = model_example_map[model](
+        question, image_urls)
+    if image_data is None:
+        image_data = [fetch_image(url) for url in image_urls]
 
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=128,
@@ -81,7 +135,7 @@ def run_generate(model, question: str, image_urls: List[str]):
         {
             "prompt": prompt,
             "multi_modal_data": {
-                "image": [fetch_image(url) for url in image_urls]
+                "image": image_data
            },
        },
        sampling_params=sampling_params)
@@ -92,7 +146,7 @@ def run_generate(model, question: str, image_urls: List[str]):
 
 
 def run_chat(model: str, question: str, image_urls: List[str]):
-    llm, _, stop_token_ids = model_example_map[model](question, image_urls)
+    llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls)
 
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=128,
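Two details in the loader above are worth calling out: when qwen-vl-utils is missing, the example raises max_model_len to 32768, presumably to leave headroom for the extra vision tokens that unresized images can produce, and image_data then falls back to raw fetch_image results. A minimal driver sketch with placeholder inputs (the real script builds these from its command-line arguments):

# Placeholder inputs; the actual example script parses these via argparse.
image_urls = [
    "https://example.com/photo-1.jpg",
    "https://example.com/photo-2.jpg",
]
question = "What is shown in these two images?"

# Exercises the single-prompt path updated above; run_chat("qwen2_vl", ...)
# goes through the same loader via the chat path.
run_generate("qwen2_vl", question, image_urls)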

requirements-common.txt

Lines changed: 1 addition & 0 deletions
@@ -28,3 +28,4 @@ importlib_metadata
 mistral_common >= 1.3.4
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
+einops # Required for Qwen2-VL.

tests/models/test_registry.py

Lines changed: 5 additions & 0 deletions
@@ -1,9 +1,14 @@
 import pytest
+import transformers
 
 from vllm.model_executor.models import _MODELS, ModelRegistry
 
 
 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
+    if (model_cls == "Qwen2VLForConditionalGeneration"
+            and transformers.__version__ < "4.45"):
+        pytest.skip("Waiting for next transformers release")
+
     # Ensure all model classes can be imported successfully
     ModelRegistry.resolve_model_cls([model_cls])

vllm/config.py

Lines changed: 6 additions & 3 deletions
@@ -773,7 +773,7 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
             Default to "original/**/*" to avoid repeated loading of llama's
             checkpoints.
-
+
     """
 
     load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
@@ -1733,8 +1733,11 @@ def _get_and_verify_max_len(
                 "with rope_scaling. Please raise an issue so we can "
                 "investigate.")
 
-        assert "factor" in rope_scaling
-        scaling_factor = rope_scaling["factor"]
+        if rope_type == "mrope":
+            scaling_factor = 1
+        else:
+            assert "factor" in rope_scaling
+            scaling_factor = rope_scaling["factor"]
         if rope_type == "yarn":
             derived_max_model_len = rope_scaling[
                 "original_max_position_embeddings"]

vllm/entrypoints/chat_utils.py

Lines changed: 7 additions & 1 deletion
@@ -108,7 +108,7 @@ class ConversationMessage(TypedDict, total=False):
     """The tool calls generated by the model, such as function calls."""
 
 
-ModalityStr = Literal["image", "audio"]
+ModalityStr = Literal["image", "audio", "video"]
 _T = TypeVar("_T")
 
 
@@ -158,12 +158,18 @@ def _placeholder_str(self, modality: ModalityStr,
                                  hf_config.image_token_index)
             if model_type in ("chameleon", "internvl_chat"):
                 return "<image>"
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|image_pad|><|vision_end|>"
 
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|reserved_special_token_0|>"
             raise TypeError(f"Unknown model type: {model_type}")
+        elif modality == "video":
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|video_pad|><|vision_end|>"
+            raise TypeError(f"Unknown model type: {model_type}")
         else:
             raise TypeError(f"Unknown modality: {modality}")
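These placeholder strings are what the chat entrypoints insert in place of image and video items when building the model prompt. A minimal sketch of exercising the image path through vLLM's OpenAI-compatible server; it assumes a locally running server hosting Qwen/Qwen2-VL-7B-Instruct, and the image URL is a placeholder:

from openai import OpenAI

# Assumes an OpenAI-compatible vLLM server for Qwen/Qwen2-VL-7B-Instruct is
# listening on localhost:8000; the API key is a dummy value.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            # chat_utils expands this item into
            # "<|vision_start|><|image_pad|><|vision_end|>" via _placeholder_str.
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cat.jpg"}},
            {"type": "text", "text": "Describe this image."},
        ],
    }],
)
print(response.choices[0].message.content)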
