Skip to content

Commit f8ded0c

Browse files
committed
fix and test
Signed-off-by: Dong Wang <dongw2019@gmail.com>
1 parent c5c0d26 commit f8ded0c

File tree

10 files changed

+733
-218
lines changed

10 files changed

+733
-218
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -734,7 +734,7 @@ Some models are supported only via the [Transformers modeling backend](#transfor
734734

735735
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
736736
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
737-
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
737+
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
738738

739739
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
740740

tests/models/multimodal/generation/test_common.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -710,13 +710,33 @@
710710
),
711711
"moondream3": VLMTestInfo(
712712
models=["moondream/moondream3-preview"],
713-
test_type=VLMTestType.IMAGE,
713+
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
714714
prompt_formatter=lambda img_prompt: f"<|endoftext|>{img_prompt}",
715715
# Note: space after <image> is required for correct tokenization
716716
img_idx_to_prompt=lambda idx: "<image> \n\n",
717+
# Moondream3-specific prompts to test query and caption skills
718+
single_image_prompts=IMAGE_ASSETS.prompts(
719+
{
720+
"stop_sign": "Question: What is this sign?\n\nAnswer:",
721+
"cherry_blossom": "Question: What season is shown?\n\nAnswer:",
722+
}
723+
),
717724
max_model_len=2048,
718725
max_num_seqs=2,
719726
dtype="bfloat16",
727+
patch_hf_runner=model_utils.moondream3_patch_hf_runner,
728+
hf_model_kwargs={"trust_remote_code": True},
729+
# Custom inputs to test all Moondream3 skills
730+
custom_test_opts=[
731+
CustomTestOptions(
732+
inputs=custom_inputs.moondream3_skill_inputs(),
733+
limit_mm_per_prompt={"image": 1},
734+
),
735+
CustomTestOptions(
736+
inputs=custom_inputs.moondream3_multi_size_inputs(),
737+
limit_mm_per_prompt={"image": 1},
738+
),
739+
],
720740
# Moondream3 is 9B params with MoE, needs significant GPU memory
721741
marks=[large_gpu_mark(min_gb=48)],
722742
),
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""Standalone generation tests for Moondream3 model.
4+
5+
These tests verify end-to-end inference capabilities including:
6+
- Basic model loading and generation
7+
- Multi-skill support (Query, Caption, Detect, Point)
8+
- Tensor parallelism (TP=2)
9+
- Various image sizes
10+
"""
11+
12+
import pytest
13+
import torch
14+
from PIL import Image
15+
16+
from ....utils import large_gpu_mark
17+
18+
MOONDREAM3_MODEL_ID = "moondream/moondream3-preview"
19+
MOONDREAM3_TOKENIZER = "moondream/starmie-v1"
20+
21+
22+
def make_query_prompt(question: str) -> str:
    """Build a Moondream3 query-skill prompt wrapping *question*.

    Layout: BOS token, image placeholder (the trailing space after
    ``<image>`` is required for correct tokenization), then a
    Question/Answer template.
    """
    parts = ("<|endoftext|><image> \n\nQuestion: ", question, "\n\nAnswer:")
    return "".join(parts)
25+
26+
27+
def make_caption_prompt() -> str:
    """Build a Moondream3 caption-skill prompt (a plain describe request,
    no Question/Answer template)."""
    # Trailing space after <image> is required for correct tokenization.
    prompt = "<|endoftext|><image> \n\nDescribe this image.\n\n"
    return prompt
30+
31+
32+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_model_loading(model_id: str):
    """Smoke test: Moondream3 should initialize in vLLM without raising.

    The engine is explicitly released afterwards so that later tests in
    the same pytest process do not accumulate ~9B-param models on the GPU
    and run out of memory.
    """
    from vllm import LLM

    llm = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )
    try:
        assert llm is not None
    finally:
        # Free the model before the next test allocates its own engine.
        del llm
        torch.cuda.empty_cache()
47+
48+
49+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_query_skill(model_id: str):
    """The query (question answering) skill should yield a non-empty answer."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )

    request = {
        "prompt": make_query_prompt("What color is this image?"),
        "multi_modal_data": {"image": Image.new("RGB", (378, 378), color="blue")},
    }
    results = engine.generate(request, SamplingParams(max_tokens=50, temperature=0))

    answer = results[0].outputs[0].text
    assert answer is not None
    assert len(answer) > 0
75+
76+
77+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_caption_skill(model_id: str):
    """The caption (image description) skill should yield non-empty text."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )

    request = {
        "prompt": make_caption_prompt(),
        "multi_modal_data": {"image": Image.new("RGB", (378, 378), color="green")},
    }
    results = engine.generate(request, SamplingParams(max_tokens=100, temperature=0))

    caption = results[0].outputs[0].text
    assert caption is not None
    assert len(caption) > 0
103+
104+
105+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_batched_inference(model_id: str):
    """Two image requests submitted together should each produce output."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )

    batch = []
    for color in ("red", "blue"):
        batch.append(
            {
                "prompt": make_query_prompt("What color?"),
                "multi_modal_data": {
                    "image": Image.new("RGB", (378, 378), color=color)
                },
            }
        )

    results = engine.generate(batch, SamplingParams(max_tokens=20, temperature=0))

    assert len(results) == 2
    for result in results:
        assert result.outputs[0].text is not None
134+
135+
136+
@pytest.mark.parametrize(
    "image_size",
    [(100, 100), (378, 378), (800, 600), (1920, 1080)],
)
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_various_image_sizes(image_size: tuple[int, int], model_id: str):
    """Inference should succeed across small, native, large and non-square
    image sizes (exercising the multi-crop tiling path).

    Each parametrized case constructs its own engine, so the model is
    released in a ``finally`` block to keep four consecutive 48 GB loads
    in the same pytest process from exhausting GPU memory.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )
    try:
        width, height = image_size
        image = Image.new("RGB", (width, height), color="purple")
        prompt = make_query_prompt("Describe.")

        outputs = llm.generate(
            {"prompt": prompt, "multi_modal_data": {"image": image}},
            SamplingParams(max_tokens=20, temperature=0),
        )

        assert outputs[0].outputs[0].text is not None
    finally:
        # Free this case's model before the next parametrized case loads.
        del llm
        torch.cuda.empty_cache()
165+
166+
167+
@pytest.mark.skipif(
    torch.cuda.device_count() < 2, reason="Requires at least 2 GPUs for TP=2"
)
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_tensor_parallel(model_id: str):
    """Generation should work when the model is sharded across two GPUs."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=2,
        max_model_len=2048,
        enforce_eager=True,
    )

    request = {
        "prompt": make_query_prompt("What is this?"),
        "multi_modal_data": {"image": Image.new("RGB", (378, 378), color="red")},
    }
    results = engine.generate(request, SamplingParams(max_tokens=20, temperature=0))

    assert len(results) > 0
    assert results[0].outputs[0].text is not None
196+
197+
198+
if __name__ == "__main__":
    # Convenience entry point: run this module's tests directly (verbose),
    # outside the normal pytest collection.
    pytest.main([__file__, "-v"])

tests/models/multimodal/generation/vlm_utils/custom_inputs.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,87 @@ def video_with_metadata_glm4_1v():
154154
video_data=video_input,
155155
)
156156
]
157+
158+
159+
def moondream3_skill_inputs():
    """Builds inputs exercising Moondream3 skills via prompt templates.

    Covers the query (question answering) and caption (description)
    skills. Detect and point skills also exist on this model but are not
    exercised here.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    # Moondream3 prompt format: <|endoftext|><image> \n\n{task}
    # (the space after <image> is required for correct tokenization)
    prefix = "<|endoftext|><image> \n\n"

    # (prompt body, image) pairs covering the tested skills.
    cases = [
        # Query skill - question answering
        (prefix + "Question: What is shown in this image?\n\nAnswer:", stop_sign),
        # Caption skill - image description
        (prefix + "Describe this image.\n\n", cherry_blossom),
        # Query skill - specific question
        (prefix + "Question: What colors do you see?\n\nAnswer:", stop_sign),
    ]

    return [
        PromptWithMultiModalInput(
            prompts=[prompt for prompt, _ in cases],
            image_data=[img for _, img in cases],
        )
    ]
196+
197+
198+
def moondream3_multi_size_inputs():
    """Builds Moondream3 inputs with assorted image sizes.

    Exercises the multi-crop tiling functionality across different sizes
    and aspect ratios.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    template = "<|endoftext|><image> \n\nQuestion: {}\n\nAnswer:"

    # (question, image) pairs sized to hit different tiling paths.
    cases = [
        # Small image (should use 1x1 tiling)
        ("Describe this small image.", stop_sign.resize((200, 200))),
        # Medium image (original size)
        ("What do you see?", stop_sign),
        # Large image (should use multi-crop)
        ("Describe this large image.", cherry_blossom.resize((1200, 800))),
        # Tall aspect ratio
        ("Describe this tall image.", stop_sign.resize((300, 900))),
        # Wide aspect ratio
        ("Describe this wide image.", cherry_blossom.resize((1000, 300))),
    ]

    return [
        PromptWithMultiModalInput(
            prompts=[template.format(question) for question, _ in cases],
            image_data=[img for _, img in cases],
        )
    ]

0 commit comments

Comments
 (0)