
Commit 5b59532

litianjian, DarkLight1337, and ywang96 authored
[Model][VLM] Add LLaVA-Onevision model support (vllm-project#8486)
Co-authored-by: litianjian <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
Co-authored-by: DarkLight1337 <[email protected]>
1 parent ca2b628 commit 5b59532

File tree

10 files changed: +1330 -21 lines changed

docs/source/models/supported_models.rst

Lines changed: 6 additions & 1 deletion
@@ -244,6 +244,11 @@ Multimodal Language Models
     - Video
     - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
     -
+  * - :code:`LlavaOnevisionForConditionalGeneration`
+    - LLaVA-Onevision
+    - Image\ :sup:`+` / Video
+    - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. (see note)
+    -
   * - :code:`MiniCPMV`
     - MiniCPM-V
     - Image\ :sup:`+`
@@ -288,7 +293,7 @@ Multimodal Language Models
   For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

 .. note::
-  For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+  For :code:`LLaVA-NeXT-Video`, :code:`LLaVA-Onevision` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
   This can be installed by running the following command:

   .. code-block:: bash
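
For context, here is a minimal offline-inference sketch for the newly documented model. It is not part of the diff: the chat template, model name, and max_model_len are taken from the example code added in this commit, while the image path, question, and sampling settings are hypothetical placeholders.

# Hedged sketch: image inference with LLaVA-OneVision via vLLM's LLM API.
# "sample.jpg" is a placeholder local image path.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
          max_model_len=32768)

question = "What is shown in this image?"
prompt = (f"<|im_start|>user <image>\n{question}<|im_end|> "
          "<|im_start|>assistant\n")

image = Image.open("sample.jpg")  # hypothetical input image

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)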

examples/offline_inference_vision_language.py

Lines changed: 47 additions & 13 deletions
@@ -14,7 +14,8 @@


 # LLaVA-1.5
-def run_llava(question):
+def run_llava(question, modality):
+    assert modality == "image"

     prompt = f"USER: <image>\n{question}\nASSISTANT:"

@@ -24,7 +25,8 @@ def run_llava(question):


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question):
+def run_llava_next(question, modality):
+    assert modality == "image"

     prompt = f"[INST] <image>\n{question} [/INST]"
     llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
@@ -34,15 +36,35 @@ def run_llava_next(question):

 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question):
+def run_llava_next_video(question, modality):
+    assert modality == "video"
+
     prompt = f"USER: <video>\n{question} ASSISTANT:"
     llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
     stop_token_ids = None
     return llm, prompt, stop_token_ids


+# LLaVA-OneVision
+def run_llava_onevision(question, modality):
+
+    if modality == "video":
+        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+
+    elif modality == "image":
+        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+
+    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+              max_model_len=32768)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # Fuyu
-def run_fuyu(question):
+def run_fuyu(question, modality):
+    assert modality == "image"

     prompt = f"{question}\n"
     llm = LLM(model="adept/fuyu-8b")
@@ -51,7 +73,8 @@ def run_fuyu(question):


 # Phi-3-Vision
-def run_phi3v(question):
+def run_phi3v(question, modality):
+    assert modality == "image"

     prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
     # Note: The default setting of max_num_seqs (256) and
@@ -70,7 +93,8 @@ def run_phi3v(question):


 # PaliGemma
-def run_paligemma(question):
+def run_paligemma(question, modality):
+    assert modality == "image"

     # PaliGemma has special prompt format for VQA
     prompt = "caption en"
@@ -80,7 +104,8 @@ def run_paligemma(question):


 # Chameleon
-def run_chameleon(question):
+def run_chameleon(question, modality):
+    assert modality == "image"

     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b")
@@ -89,7 +114,8 @@ def run_chameleon(question):


 # MiniCPM-V
-def run_minicpmv(question):
+def run_minicpmv(question, modality):
+    assert modality == "image"

     # 2.0
     # The official repo doesn't work yet, so we need to use a fork for now
@@ -129,7 +155,9 @@ def run_minicpmv(question):


 # InternVL
-def run_internvl(question):
+def run_internvl(question, modality):
+    assert modality == "image"
+
     model_name = "OpenGVLab/InternVL2-2B"

     llm = LLM(
@@ -155,7 +183,8 @@ def run_internvl(question):


 # BLIP-2
-def run_blip2(question):
+def run_blip2(question, modality):
+    assert modality == "image"

     # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262  # noqa
@@ -166,7 +195,8 @@ def run_blip2(question):


 # Qwen
-def run_qwen_vl(question):
+def run_qwen_vl(question, modality):
+    assert modality == "image"

     llm = LLM(
         model="Qwen/Qwen-VL",
@@ -180,7 +210,9 @@ def run_qwen_vl(question):


 # Qwen2-VL
-def run_qwen2_vl(question):
+def run_qwen2_vl(question, modality):
+    assert modality == "image"
+
     model_name = "Qwen/Qwen2-VL-7B-Instruct"

     llm = LLM(
@@ -200,6 +232,7 @@ def run_qwen2_vl(question):
     "llava": run_llava,
     "llava-next": run_llava_next,
     "llava-next-video": run_llava_next_video,
+    "llava-onevision": run_llava_onevision,
     "fuyu": run_fuyu,
     "phi3_v": run_phi3v,
     "paligemma": run_paligemma,
@@ -255,7 +288,7 @@ def main(args):
     data = mm_input["data"]
     question = mm_input["question"]

-    llm, prompt, stop_token_ids = model_example_map[model](question)
+    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
@@ -306,6 +339,7 @@ def main(args):
     parser.add_argument('--modality',
                         type=str,
                         default="image",
+                        choices=['image', 'video'],
                         help='Modality of the input.')
     parser.add_argument('--num-frames',
                         type=int,
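
As a usage sketch (not part of the diff), the new run_llava_onevision path can be exercised with video input roughly as follows. This assumes the video is supplied as a NumPy array of RGB frames, which is how the example script passes its video assets; the randomly generated frames below are a stand-in for a real decoded clip, and the sampling settings are arbitrary.

# Hedged sketch: video inference mirroring the run_llava_onevision runner added above.
# The placeholder frames stand in for a real (num_frames, height, width, 3) video clip.
import numpy as np

from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
          max_model_len=32768)

question = "Why is this video funny?"
prompt = (f"<|im_start|>user <video>\n{question}<|im_end|> "
          "<|im_start|>assistant\n")

# Placeholder: 16 random RGB frames in place of a decoded video.
video = np.random.randint(0, 255, size=(16, 336, 336, 3), dtype=np.uint8)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"video": video}},
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)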

tests/models/decoder_only/vision_language/test_llava_next_video.py

Lines changed: 0 additions & 3 deletions
@@ -105,9 +105,6 @@ def run_test(
         for asset in video_assets
     ]

-    for video in videos:
-        print(video.shape)
-
     if size_factors is not None:
         inputs_per_video = [(
             [prompt for _ in size_factors],
