Skip to content

Commit f8ded0c

Browse files
committed
fix and test
Signed-off-by: Dong Wang <dongw2019@gmail.com>
1 parent c5c0d26 commit f8ded0c

File tree

10 files changed

+733
-218
lines changed

10 files changed

+733
-218
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -734,7 +734,7 @@ Some models are supported only via the [Transformers modeling backend](#transfor
734734

735735
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
736736
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
737-
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
737+
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
738738

739739
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
740740

tests/models/multimodal/generation/test_common.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -710,13 +710,33 @@
710710
),
711711
"moondream3": VLMTestInfo(
712712
models=["moondream/moondream3-preview"],
713-
test_type=VLMTestType.IMAGE,
713+
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
714714
prompt_formatter=lambda img_prompt: f"<|endoftext|>{img_prompt}",
715715
# Note: space after <image> is required for correct tokenization
716716
img_idx_to_prompt=lambda idx: "<image> \n\n",
717+
# Moondream3-specific prompts to test query and caption skills
718+
single_image_prompts=IMAGE_ASSETS.prompts(
719+
{
720+
"stop_sign": "Question: What is this sign?\n\nAnswer:",
721+
"cherry_blossom": "Question: What season is shown?\n\nAnswer:",
722+
}
723+
),
717724
max_model_len=2048,
718725
max_num_seqs=2,
719726
dtype="bfloat16",
727+
patch_hf_runner=model_utils.moondream3_patch_hf_runner,
728+
hf_model_kwargs={"trust_remote_code": True},
729+
# Custom inputs to test all Moondream3 skills
730+
custom_test_opts=[
731+
CustomTestOptions(
732+
inputs=custom_inputs.moondream3_skill_inputs(),
733+
limit_mm_per_prompt={"image": 1},
734+
),
735+
CustomTestOptions(
736+
inputs=custom_inputs.moondream3_multi_size_inputs(),
737+
limit_mm_per_prompt={"image": 1},
738+
),
739+
],
720740
# Moondream3 is 9B params with MoE, needs significant GPU memory
721741
marks=[large_gpu_mark(min_gb=48)],
722742
),
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""Standalone generation tests for Moondream3 model.
4+
5+
These tests verify end-to-end inference capabilities including:
6+
- Basic model loading and generation
7+
- Multi-skill support (Query, Caption, Detect, Point)
8+
- Tensor parallelism (TP=2)
9+
- Various image sizes
10+
"""
11+
12+
import pytest
13+
import torch
14+
from PIL import Image
15+
16+
from ....utils import large_gpu_mark
17+
18+
MOONDREAM3_MODEL_ID = "moondream/moondream3-preview"
19+
MOONDREAM3_TOKENIZER = "moondream/starmie-v1"
20+
21+
22+
def make_query_prompt(question: str) -> str:
    """Build a Moondream3 query-skill prompt wrapping *question*.

    Layout: BOS token, image placeholder (the trailing space after
    ``<image>`` is required for correct tokenization), then a
    Question/Answer template.
    """
    parts = ("<|endoftext|><image> \n\nQuestion: ", question, "\n\nAnswer:")
    return "".join(parts)
25+
26+
27+
def make_caption_prompt() -> str:
    """Build a Moondream3 caption-skill prompt (a plain describe request,
    no Question/Answer template)."""
    # Trailing space after <image> is required for correct tokenization.
    prompt = "<|endoftext|><image> \n\nDescribe this image.\n\n"
    return prompt
30+
31+
32+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_model_loading(model_id: str):
    """Smoke test: Moondream3 should initialize in vLLM without raising.

    The engine is explicitly released afterwards so that later tests in
    the same pytest process do not accumulate ~9B-param models on the GPU
    and run out of memory.
    """
    from vllm import LLM

    llm = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )
    try:
        assert llm is not None
    finally:
        # Free the model before the next test allocates its own engine.
        del llm
        torch.cuda.empty_cache()
47+
48+
49+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_query_skill(model_id: str):
    """The query (question answering) skill should yield a non-empty answer."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )

    request = {
        "prompt": make_query_prompt("What color is this image?"),
        "multi_modal_data": {"image": Image.new("RGB", (378, 378), color="blue")},
    }
    results = engine.generate(request, SamplingParams(max_tokens=50, temperature=0))

    answer = results[0].outputs[0].text
    assert answer is not None
    assert len(answer) > 0
75+
76+
77+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_caption_skill(model_id: str):
    """The caption (image description) skill should yield non-empty text."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )

    request = {
        "prompt": make_caption_prompt(),
        "multi_modal_data": {"image": Image.new("RGB", (378, 378), color="green")},
    }
    results = engine.generate(request, SamplingParams(max_tokens=100, temperature=0))

    caption = results[0].outputs[0].text
    assert caption is not None
    assert len(caption) > 0
103+
104+
105+
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_batched_inference(model_id: str):
    """Two image requests submitted together should each produce output."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )

    batch = []
    for color in ("red", "blue"):
        batch.append(
            {
                "prompt": make_query_prompt("What color?"),
                "multi_modal_data": {
                    "image": Image.new("RGB", (378, 378), color=color)
                },
            }
        )

    results = engine.generate(batch, SamplingParams(max_tokens=20, temperature=0))

    assert len(results) == 2
    for result in results:
        assert result.outputs[0].text is not None
134+
135+
136+
@pytest.mark.parametrize(
    "image_size",
    [(100, 100), (378, 378), (800, 600), (1920, 1080)],
)
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_various_image_sizes(image_size: tuple[int, int], model_id: str):
    """Inference should succeed across small, native, large and non-square
    image sizes (exercising the multi-crop tiling path).

    Each parametrized case constructs its own engine, so the model is
    released in a ``finally`` block to keep four consecutive 48 GB loads
    in the same pytest process from exhausting GPU memory.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=2048,
        enforce_eager=True,
    )
    try:
        width, height = image_size
        image = Image.new("RGB", (width, height), color="purple")
        prompt = make_query_prompt("Describe.")

        outputs = llm.generate(
            {"prompt": prompt, "multi_modal_data": {"image": image}},
            SamplingParams(max_tokens=20, temperature=0),
        )

        assert outputs[0].outputs[0].text is not None
    finally:
        # Free this case's model before the next parametrized case loads.
        del llm
        torch.cuda.empty_cache()
165+
166+
167+
@pytest.mark.skipif(
    torch.cuda.device_count() < 2, reason="Requires at least 2 GPUs for TP=2"
)
@pytest.mark.parametrize("model_id", [MOONDREAM3_MODEL_ID])
@large_gpu_mark(min_gb=48)
def test_tensor_parallel(model_id: str):
    """Generation should work when the model is sharded across two GPUs."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=model_id,
        tokenizer=MOONDREAM3_TOKENIZER,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=2,
        max_model_len=2048,
        enforce_eager=True,
    )

    request = {
        "prompt": make_query_prompt("What is this?"),
        "multi_modal_data": {"image": Image.new("RGB", (378, 378), color="red")},
    }
    results = engine.generate(request, SamplingParams(max_tokens=20, temperature=0))

    assert len(results) > 0
    assert results[0].outputs[0].text is not None
196+
197+
198+
if __name__ == "__main__":
    # Convenience entry point: run this module's tests directly (verbose),
    # outside the normal pytest collection.
    pytest.main([__file__, "-v"])

tests/models/multimodal/generation/vlm_utils/custom_inputs.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,87 @@ def video_with_metadata_glm4_1v():
154154
video_data=video_input,
155155
)
156156
]
157+
158+
159+
def moondream3_skill_inputs():
    """Builds inputs exercising Moondream3 skills via prompt templates.

    Covers the query (question answering) and caption (description)
    skills. Detect and point skills also exist on this model but are not
    exercised here.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    # Moondream3 prompt format: <|endoftext|><image> \n\n{task}
    # (the space after <image> is required for correct tokenization)
    prefix = "<|endoftext|><image> \n\n"

    # (prompt body, image) pairs covering the tested skills.
    cases = [
        # Query skill - question answering
        (prefix + "Question: What is shown in this image?\n\nAnswer:", stop_sign),
        # Caption skill - image description
        (prefix + "Describe this image.\n\n", cherry_blossom),
        # Query skill - specific question
        (prefix + "Question: What colors do you see?\n\nAnswer:", stop_sign),
    ]

    return [
        PromptWithMultiModalInput(
            prompts=[prompt for prompt, _ in cases],
            image_data=[img for _, img in cases],
        )
    ]
196+
197+
198+
def moondream3_multi_size_inputs():
    """Builds Moondream3 inputs with assorted image sizes.

    Exercises the multi-crop tiling functionality across different sizes
    and aspect ratios.
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    template = "<|endoftext|><image> \n\nQuestion: {}\n\nAnswer:"

    # (question, image) pairs sized to hit different tiling paths.
    cases = [
        # Small image (should use 1x1 tiling)
        ("Describe this small image.", stop_sign.resize((200, 200))),
        # Medium image (original size)
        ("What do you see?", stop_sign),
        # Large image (should use multi-crop)
        ("Describe this large image.", cherry_blossom.resize((1200, 800))),
        # Tall aspect ratio
        ("Describe this tall image.", stop_sign.resize((300, 900))),
        # Wide aspect ratio
        ("Describe this wide image.", cherry_blossom.resize((1000, 300))),
    ]

    return [
        PromptWithMultiModalInput(
            prompts=[template.format(question) for question, _ in cases],
            image_data=[img for _, img in cases],
        )
    ]

0 commit comments

Comments
 (0)