
Commit 4ef00b5

[VLM] Add Nemotron-Nano-VL-8B-V1 support (#20349)
Signed-off-by: Kyle Huang <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>

1 parent: 5a7fb3a
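For orientation, a minimal offline-inference sketch using the model this commit enables. It mirrors the run_nemotron_vl example added below: the vLLM LLM entry point and engine arguments come from that example, while the image, prompt string, and sampling settings are illustrative placeholders (the example itself formats prompts with the tokenizer's chat template instead).

# Hedged sketch: offline inference with the newly supported model.
# Engine arguments mirror the run_nemotron_vl example in this commit;
# the image path, prompt, and sampling settings are placeholders.
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
    trust_remote_code=True,
    max_model_len=8192,
    limit_mm_per_prompt={"image": 1},
)

image = Image.open("example.jpg")  # placeholder input image
outputs = llm.generate(
    {
        "prompt": "<image>\nWhat is shown in this image?",
        "multi_modal_data": {"image": image},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)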

File tree

11 files changed: +701 -3 lines changed

docker/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ WORKDIR /workspace/vllm
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
-    sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
     sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
     sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
     uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
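The one-character change above is load-bearing: this same commit adds open_clip_torch==2.32.0 to requirements/test.in, and without the ^ anchor the pattern torch==.* also matches the torch== substring inside that pin, rewriting its version. A sketch of the difference, using Python's re module as a stand-in for sed (the pin line is the one added by this commit):

import re

line = "open_clip_torch==2.32.0 # Required for nemotron_vl test"

# Unanchored: matches "torch==" mid-word and clobbers the open_clip pin.
print(re.sub(r"torch==.*", "torch==2.6.0", line))
# -> open_clip_torch==2.6.0

# Anchored: only rewrites lines that start with "torch==".
print(re.sub(r"^torch==.*", "torch==2.6.0", line))
# -> open_clip_torch==2.32.0 # Required for nemotron_vl test (unchanged)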

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -581,6 +581,7 @@ Specified using `--task generate`.
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
+| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
 | `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
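In this table, T + I<sup>E+</sup> means text plus one or more images, with pre-computed image embeddings accepted, and the three trailing check marks on the new row correspond to the LoRA, pipeline-parallel, and V1-engine columns. A hedged sketch of online use through vLLM's OpenAI-compatible server, assuming it was started with `vllm serve nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1 --trust-remote-code`; the base URL and image URL are placeholders:

# Hedged sketch: querying the served model via the OpenAI client.
# Assumes a vLLM server is already running locally (see lead-in above);
# base_url and the image URL are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "https://example.com/sample.jpg"}},
            {"type": "text", "text": "Describe this image."},
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)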

examples/offline_inference/vision_language.py

Lines changed: 39 additions & 0 deletions
@@ -429,6 +429,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Nemotron_VL
+def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    assert modality == "image"
+    placeholder = "<image>"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Stop tokens for InternVL-style models; model variants may have
+    # different stop tokens, so please refer to the model card for the
+    # correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@@ -1186,6 +1224,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "h2ovl_chat": run_h2ovl,
     "idefics3": run_idefics3,
     "internvl_chat": run_internvl,
+    "nemotron_vl": run_nemotron_vl,
     "keye_vl": run_keye_vl,
     "kimi_vl": run_kimi_vl,
     "llava": run_llava,

requirements/test.in

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.8.0 # required for voxtral test
 num2words # required for smolvlm test
+open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test

requirements/test.txt

Lines changed: 15 additions & 1 deletion
@@ -174,6 +174,8 @@ fsspec==2024.9.0
     #   fastparquet
     #   huggingface-hub
     #   torch
+ftfy==6.3.1
+    # via open-clip-torch
 genai-perf==0.0.8
     # via -r requirements/test.in
 genson==1.3.0
@@ -208,6 +210,7 @@ huggingface-hub==0.33.0
     #   accelerate
     #   datasets
     #   evaluate
+    #   open-clip-torch
     #   peft
     #   sentence-transformers
     #   timm
@@ -414,6 +417,8 @@ nvidia-nvjitlink-cu12==12.8.61
     #   torch
 nvidia-nvtx-cu12==12.8.55
     # via torch
+open-clip-torch==2.32.0
+    # via -r requirements/test.in
 opencensus==0.11.4
     # via ray
 opencensus-context==0.1.3
@@ -615,6 +620,7 @@ referencing==0.35.1
 regex==2024.9.11
     # via
     #   nltk
+    #   open-clip-torch
     #   sacrebleu
     #   tiktoken
     #   transformers
@@ -665,6 +671,7 @@ sacrebleu==2.4.3
 safetensors==0.4.5
     # via
     #   accelerate
+    #   open-clip-torch
     #   peft
     #   timm
     #   transformers
@@ -753,7 +760,9 @@ tiktoken==0.7.0
     #   lm-eval
     #   mistral-common
 timm==1.0.11
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   open-clip-torch
 tokenizers==0.21.1
     # via
     #   -r requirements/test.in
@@ -772,6 +781,7 @@ torch==2.7.1+cu128
     #   lm-eval
     #   mamba-ssm
     #   mteb
+    #   open-clip-torch
     #   peft
     #   runai-model-streamer
     #   sentence-transformers
@@ -789,6 +799,7 @@ torchaudio==2.7.1+cu128
 torchvision==0.22.1+cu128
     # via
     #   -r requirements/test.in
+    #   open-clip-torch
     #   timm
 tqdm==4.66.6
     # via
@@ -798,6 +809,7 @@ tqdm==4.66.6
     #   lm-eval
     #   mteb
     #   nltk
+    #   open-clip-torch
     #   peft
     #   pqdm
     #   sentence-transformers
@@ -863,6 +875,8 @@ virtualenv==20.31.2
     # via ray
 vocos==0.1.0
     # via -r requirements/test.in
+wcwidth==0.2.13
+    # via ftfy
 webcolors==24.11.1
     # via jsonschema
 werkzeug==3.1.3

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -291,6 +291,7 @@ def _test_processing_correctness_one(
     "allenai/Molmo-7B-D-0924",
     "allenai/Molmo-7B-O-0924",
     "nvidia/NVLM-D-72B",
+    "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
     "AIDC-AI/Ovis1.6-Gemma2-9B",
     "AIDC-AI/Ovis1.6-Llama3.2-3B",
     "AIDC-AI/Ovis2-1B",
tests/models/multimodal/processing/test_nemotron_vl.py (new file; the path is not shown in this view and is inferred from the test's relative imports)

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
+from collections.abc import Mapping
+from typing import Optional
+
+import pytest
+from PIL import Image
+from transformers import PretrainedConfig
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.processing import BaseMultiModalProcessor
+
+from ....conftest import ImageTestAssets
+from ...utils import build_model_context
+
+
+def _get_expected_num_patches(
+    config: PretrainedConfig,
+    image: Image.Image,
+    num_imgs: int,
+    min_num: int,
+    max_num: int,
+):
+    from vllm.model_executor.models.internvl import (
+        calculate_internvl_targets, get_internvl_target_ratios)
+
+    width, height = image.size
+
+    blocks, _, _ = calculate_internvl_targets(
+        orig_width=width,
+        orig_height=height,
+        target_ratios=get_internvl_target_ratios(
+            min_num,
+            max_num,
+        ),
+        image_size=config.force_image_size,
+        use_thumbnail=False,
+    )
+    expected_num_patches = blocks
+
+    if config.use_thumbnail and expected_num_patches > 1:
+        expected_num_patches += 1
+
+    return expected_num_patches
+
+
+def _run_check(
+    processor: BaseMultiModalProcessor,
+    images: list[Image.Image],
+    min_num: int,
+    max_num: int,
+    mm_processor_kwargs: Mapping[str, object],
+):
+    tokenizer = processor.info.get_tokenizer()
+    config = processor.info.get_hf_config()
+    image_processor = processor.info.get_image_processor()
+
+    config.use_thumbnail = image_processor.use_thumbnail
+    prompt = "<image>" * len(images)
+    mm_data = {"image": images}
+
+    total_expected_num_patches = sum(
+        _get_expected_num_patches(config, image, len(images), min_num, max_num)
+        for image in images)
+    print(total_expected_num_patches)
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+
+    # Ensure we have the right number of placeholders per num_crops size
+    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
+    assert img_tok_count == 256 * total_expected_num_patches
+    assert pixel_shape[0] == total_expected_num_patches
+
+
+@pytest.mark.parametrize("model_id",
+                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+        [4.0, 2.0, 1.0],
+    ],
+)
+@pytest.mark.parametrize(
+    ("min_dynamic_patch", "max_dynamic_patch"),
+    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
+)
+@pytest.mark.parametrize("dynamic_image_size", [True, False])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+    model_id: str,
+    image_assets: ImageTestAssets,
+    size_factors: list[int],
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: Optional[bool],
+    kwargs_on_init: bool,
+):
+    mm_processor_kwargs = {
+        "min_dynamic_patch": min_dynamic_patch,
+        "max_dynamic_patch": max_dynamic_patch,
+        "dynamic_image_size": dynamic_image_size,
+    }
+
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+        limit_mm_per_prompt={"image": len(size_factors)},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+
+    min_num = min_dynamic_patch if dynamic_image_size else 1
+    max_num = max_dynamic_patch if dynamic_image_size else 1
+
+    _run_check(
+        processor,
+        [
+            rescale_image_size(image_assets[0].pil_image, f)
+            for f in size_factors
+        ],
+        min_num,
+        max_num,
+        hf_processor_mm_kwargs,
+    )
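For intuition about the assertions above: the helper reuses InternVL-style dynamic tiling, which picks a tile grid whose aspect ratio best matches the image (subject to min_num <= tiles <= max_num) and optionally appends a thumbnail tile; the test then expects 256 prompt placeholders per tile. A simplified, self-contained sketch of that arithmetic (illustrative only: the real calculate_internvl_targets also breaks aspect-ratio ties by image area, and the 256 tokens-per-tile figure is taken from the test's assertion, not derived here):

# Simplified sketch of the InternVL-style patch-count arithmetic the test
# relies on. Illustrative only: the real helper also breaks aspect-ratio
# ties by area, and tokens-per-tile (256) comes from the test's assertion.
def expected_patches(width: int, height: int, min_num: int, max_num: int,
                     use_thumbnail: bool = True) -> int:
    best_grid, best_diff = (1, 1), float("inf")
    for cols in range(1, max_num + 1):
        for rows in range(1, max_num + 1):
            if not (min_num <= cols * rows <= max_num):
                continue
            diff = abs(width / height - cols / rows)
            if diff < best_diff:
                best_grid, best_diff = (cols, rows), diff
    blocks = best_grid[0] * best_grid[1]
    # A thumbnail tile is appended only when the image was actually split.
    if use_thumbnail and blocks > 1:
        blocks += 1
    return blocks

# e.g. a 1280x720 image with min_num=1, max_num=4 picks a 2x1 grid:
# 2 tiles + 1 thumbnail = 3 patches -> 3 * 256 = 768 image tokens.
assert expected_patches(1280, 720, 1, 4) == 3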

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
@@ -401,6 +401,8 @@ def check_available_online(
                                         trust_remote_code=True),
     "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
                               trust_remote_code=True),
+    "Llama_Nemotron_Nano_VL": _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",  # noqa: E501
+                                              trust_remote_code=True),
     "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
                                                          extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
     "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
