
Commit 102bf96

[Model] Add smolvlm support (#16017)
Signed-off-by: chaunceyjiang <[email protected]>
Parent commit: 1f4b09b

File tree

14 files changed: +219 −6 lines

docs/source/models/supported_models.md

Lines changed: 7 additions & 0 deletions

@@ -990,6 +990,13 @@ See [this page](#generative-models) for more information on how to use generative models.
   *
   * ✅︎
   * ✅︎
+- * `SmolVLMForConditionalGeneration`
+  * SmolVLM2
+  * T + I
+  * `SmolVLM2-2.2B-Instruct`
+  *
+  * ✅︎
+  * ✅︎
 - * `UltravoxModel`
   * Ultravox
   * T + A<sup>E+</sup>

examples/offline_inference/vision_language.py

Lines changed: 29 additions & 0 deletions

@@ -298,6 +298,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
     )


+# SmolVLM2-2.2B-Instruct
+def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        mm_processor_kwargs={
+            "max_image_size": {
+                "longest_edge": 384
+            },
+        },
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+    prompts = [
+        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

@@ -955,6 +983,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "qwen2_vl": run_qwen2_vl,
     "qwen2_5_vl": run_qwen2_5_vl,
     "skywork_chat": run_skyworkr1v,
+    "smolvlm": run_smolvlm,
 }

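For reference, the same configuration can be exercised directly through vLLM's offline LLM API instead of the example script. This is only a minimal sketch mirroring run_smolvlm() above; the question text, local image path, and sampling parameters are illustrative assumptions and are not part of this commit.

from PIL import Image

from vllm import LLM, SamplingParams

# Engine arguments mirror run_smolvlm() in the diff above.
llm = LLM(
    model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    max_model_len=8192,
    max_num_seqs=2,
    enforce_eager=True,
    mm_processor_kwargs={"max_image_size": {"longest_edge": 384}},
)

# Placeholder question and image path, for illustration only.
question = "What is shown in this image?"
prompt = f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
image = Image.open("example.jpg").convert("RGB")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)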
examples/offline_inference/vision_language_multi_image.py

Lines changed: 28 additions & 0 deletions

@@ -217,6 +217,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={
+            "max_image_size": {
+                "longest_edge": 384
+            },
+        },
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL2-2B"

@@ -614,6 +641,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
+    "smolvlm": load_smolvlm,
 }
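As a rough usage sketch (not part of this commit), the multi-image prompt built by load_smolvlm() can be fed to the offline LLM API in the same way. The image URLs and question below are placeholders.

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

# Placeholder URLs; substitute any reachable images.
image_urls = [
    "https://example.com/cat.jpg",
    "https://example.com/dog.jpg",
]

# Engine arguments mirror load_smolvlm() in the diff above.
llm = LLM(
    model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    max_model_len=8192,
    max_num_seqs=16,
    enforce_eager=True,
    limit_mm_per_prompt={"image": len(image_urls)},
    mm_processor_kwargs={"max_image_size": {"longest_edge": 384}},
)

placeholders = "\n".join(f"Image-{i}: <image>\n"
                         for i, _ in enumerate(image_urls, start=1))
question = "What do these images have in common?"
prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": [fetch_image(url) for url in image_urls]},
    },
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)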

requirements/test.in

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ torchvision==0.21.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.4 # required for pixtral test
+num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test

requirements/test.txt

Lines changed: 4 additions & 0 deletions

@@ -101,6 +101,8 @@ dill==0.3.8
     #   multiprocess
 dnspython==2.7.0
     # via email-validator
+docopt==0.6.2
+    # via num2words
 docutils==0.16
     # via awscli
 einops==0.8.0

@@ -263,6 +265,8 @@ networkx==3.2.1
     # via torch
 nltk==3.9.1
     # via rouge-score
+num2words==0.5.14
+    # via -r requirements/test.in
 numba==0.61.0
     # via
     #   -r requirements/test.in

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 10 additions & 0 deletions

@@ -493,6 +493,16 @@
         patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
         marks=[large_gpu_mark(min_gb=80)],
     ),
+    "smolvlm": VLMTestInfo(
+        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>",
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
+    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "chameleon-broadcast": VLMTestInfo(
         models=["facebook/chameleon-7b"],

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

Lines changed: 6 additions & 0 deletions

@@ -204,6 +204,12 @@ def idefics3_trunc_hf_output(hf_output: RunnerOutput,
     return output_ids, output_str, out_logprobs


+def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
+                            model: str) -> RunnerOutput:
+    # Based on Idefics3
+    return idefics3_trunc_hf_output(hf_output, model)
+
+
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                              model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions

@@ -257,6 +257,7 @@ def _test_processing_correctness_mistral(
     "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",
     "HuggingFaceM4/Idefics3-8B-Llama3",
+    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
(new test file for SmolVLM multimodal processing; path not shown in this view)

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for smolvlm's multimodal preprocessing kwargs."""
+import pytest
+from transformers import SmolVLMConfig
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
+# yapf: disable
+@pytest.mark.parametrize(
+    ("mm_processor_kwargs", "expected_toks_per_img"),
+    [
+        ({"max_image_size": {"longest_edge": 384}}, 1377),
+        ({"max_image_size": {"longest_edge": 768}}, 405),
+    ])
+# yapf: enable
+@pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+    image_assets: _ImageAssets,
+    model_id: str,
+    mm_processor_kwargs: dict[str, object],
+    expected_toks_per_img: int,
+    num_imgs: int,
+    kwargs_on_init: bool,
+):
+    """Ensure the multimodal processor handles max_image_size properly."""
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the custom input processor.
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+
+    # Build the image str / prompt based on the number of images we pass
+    placeholders = "<image>" if num_imgs == 1 else "\n".join(
+        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501
+
+    # Build mm_data
+    image_size = ctx.get_hf_config(SmolVLMConfig).vision_config.image_size
+    dummy_image_size = (image_size * 4, image_size * 4)
+    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+    mm_data = {"image": [dummy_image] * num_imgs}
+
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Ensure the placeholder format is correct
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
+        "input_ids"][0]
+
+    # Ensure we have the right number of placeholders per max_image_size
+    image_token_id = ctx.get_hf_config().image_token_id
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    assert img_tok_count == expected_toks_per_img * num_imgs
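The kwargs_on_init parametrization above checks that max_image_size takes effect whether it is fixed when the processor context is built or supplied at apply() time. A minimal user-facing sketch of the two paths follows; the prompt and image are placeholders, and per-request mm_processor_kwargs support is an assumption about your vLLM release rather than something this commit guarantees.

from PIL import Image

from vllm import LLM

image = Image.open("example.jpg").convert("RGB")  # placeholder image
prompt = "<|im_start|>User:<image>Describe the image.<end_of_utterance>\nAssistant:"

# Path 1: kwargs fixed at engine construction (kwargs_on_init=True).
llm = LLM(
    model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    mm_processor_kwargs={"max_image_size": {"longest_edge": 384}},
)
outputs = llm.generate({"prompt": prompt, "multi_modal_data": {"image": image}})

# Path 2: kwargs supplied per request (kwargs_on_init=False); assumed to be
# supported by the installed vLLM version.
llm = LLM(model="HuggingFaceTB/SmolVLM2-2.2B-Instruct")
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
    "mm_processor_kwargs": {"max_image_size": {"longest_edge": 384}},
})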

tests/models/registry.py

Lines changed: 1 addition & 0 deletions

@@ -344,6 +344,7 @@ def check_available_online(
     "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
                                                           min_transformers_version="4.49"),  # noqa: E501
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
+    "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                      trust_remote_code=True,
                                      max_transformers_version="4.50"),
