Commit ed495b7

[Model][VLM] Add Kimi-VL model support (vllm-project#16387)

courage17340 authored and dbyoung18 committed

Signed-off-by: courage17340 <[email protected]>

1 parent fa7aba5 · commit ed495b7
File tree

18 files changed: +1436 −14 lines

docs/source/models/supported_models.md
Lines changed: 7 additions & 0 deletions

```diff
@@ -886,6 +886,13 @@ See [this page](#generative-models) for more information on how to use generative models.
   *
   * ✅︎
   * ✅︎
+- * `KimiVLForConditionalGeneration`
+  * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking
+  * T + I<sup>+</sup>
+  * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`
+  *
+  *
+  * ✅︎
 - * `Llama4ForConditionalGeneration`
   * Llama 4
   * T + I<sup>+</sup>
```
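The new table row registers `KimiVLForConditionalGeneration` with text plus multi-image input (`T + I<sup>+</sup>`). For orientation, here is a minimal offline-inference sketch against the new checkpoint. It is not part of the commit: the image path and question are placeholders, and the prompt string mirrors `run_kimi_vl` from the example script below.

```python
# Hedged sketch (not part of this commit): minimal offline image inference
# with the newly supported model. "example.jpg" and the question are
# placeholders; the prompt format mirrors run_kimi_vl() below.
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="moonshotai/Kimi-VL-A3B-Instruct",
    max_model_len=4096,
    trust_remote_code=True,  # Kimi-VL ships custom processor code on the Hub
)

question = "Describe this image."
prompt = (
    "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
    f"<|media_pad|><|media_end|>{question}<|im_end|>"
    "<|im_assistant|>assistant<|im_middle|>"
)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("example.jpg")}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```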

examples/offline_inference/vision_language.py
Lines changed: 24 additions & 0 deletions

```diff
@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     )


+# Kimi-VL
+def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    prompts = [
+        "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
+        f"<|media_pad|><|media_end|>{question}<|im_end|>"
+        "<|im_assistant|>assistant<|im_middle|>" for question in questions
+    ]
+
+    engine_args = EngineArgs(
+        model="moonshotai/Kimi-VL-A3B-Instruct",
+        max_model_len=4096,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        trust_remote_code=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # LLaVA-1.5
 def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -966,6 +989,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "h2ovl_chat": run_h2ovl,
     "idefics3": run_idefics3,
     "internvl_chat": run_internvl,
+    "kimi_vl": run_kimi_vl,
     "llava": run_llava,
     "llava-next": run_llava_next,
     "llava-next-video": run_llava_next_video,
```
examples/offline_inference/vision_language_multi_image.py
Lines changed: 40 additions & 0 deletions

```diff
@@ -326,6 +326,45 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=4,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        trust_remote_code=True,
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name,
+                                              trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

@@ -640,6 +679,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     "h2ovl_chat": load_h2ovl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
+    "kimi_vl": load_kimi_vl,
     "llama4": load_llama4,
     "mistral3": load_mistral3,
     "mllama": load_mllama,
```

requirements/test.in
Lines changed: 1 addition & 0 deletions

```diff
@@ -10,6 +10,7 @@ pytest-timeout
 # testing utils
 awscli
 backoff # required for phi4mm test
+blobfile # required for kimi-vl test
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
```

requirements/test.txt
Lines changed: 9 additions & 1 deletion

```diff
@@ -39,6 +39,8 @@ bitsandbytes==0.45.3
     # via -r requirements/test.in
 black==24.10.0
     # via datamodel-code-generator
+blobfile==3.0.0
+    # via -r requirements/test.in
 boto3==1.35.57
     # via tensorizer
 botocore==1.35.57
@@ -127,6 +129,7 @@ fastsafetensors==0.1.10
     # via -r requirements/test.in
 filelock==3.16.1
     # via
+    #   blobfile
     #   datasets
     #   huggingface-hub
     #   ray
@@ -227,7 +230,9 @@ llvmlite==0.44.0
 lm-eval==0.4.8
     # via -r requirements/test.in
 lxml==5.3.0
-    # via sacrebleu
+    # via
+    #   blobfile
+    #   sacrebleu
 markdown-it-py==3.0.0
     # via rich
 markupsafe==3.0.2
@@ -426,6 +431,8 @@ pybind11==2.13.6
     # via lm-eval
 pycparser==2.22
     # via cffi
+pycryptodomex==3.22.0
+    # via blobfile
 pydantic==2.9.2
     # via
     #   datamodel-code-generator
@@ -689,6 +696,7 @@ tzdata==2024.2
     # via pandas
 urllib3==2.2.3
     # via
+    #   blobfile
     #   botocore
     #   requests
     #   responses
```

tests/models/decoder_only/vision_language/test_models.py
Lines changed: 12 additions & 0 deletions

```diff
@@ -318,6 +318,18 @@
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
     ),
+    "kimi_vl": VLMTestInfo(
+        models=["moonshotai/Kimi-VL-A3B-Instruct"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>",  # noqa: E501
+        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
+        max_model_len=8192,
+        max_num_seqs=2,
+        dtype="bfloat16",
+        tensor_parallel_size=1,
+        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
+        marks=[large_gpu_mark(min_gb=48)],
+    ),
     "llama4": VLMTestInfo(
         models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
         prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
```

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
Lines changed: 11 additions & 0 deletions

```diff
@@ -68,6 +68,17 @@ def qwen2_vllm_to_hf_output(
     return output_ids, hf_output_str, out_logprobs


+def kimiv_vl_vllm_to_hf_output(
+        vllm_output: RunnerOutput,
+        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    """Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    hf_output_str = output_str + "<|im_end|>[EOS]"
+
+    return output_ids, hf_output_str, out_logprobs
+
+
 def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
                                   model: str) -> RunnerOutput:
     config = AutoConfig.from_pretrained(model)
```
tests/models/multimodal/processing/test_common.py
Lines changed: 1 addition & 0 deletions

```diff
@@ -258,6 +258,7 @@ def _test_processing_correctness_mistral(
     "OpenGVLab/InternVL2-1B",
     "HuggingFaceM4/Idefics3-8B-Llama3",
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    "moonshotai/Kimi-VL-A3B-Instruct",
     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
```

tests/models/registry.py
Lines changed: 3 additions & 0 deletions

```diff
@@ -302,6 +302,9 @@ def check_available_online(
                                                       trust_remote_code=True),
     "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                         {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
+    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
+                                                      extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
+                                                      trust_remote_code=True),
 "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
                                                       min_transformers_version="4.51"),
     "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
```

vllm/entrypoints/chat_utils.py
Lines changed: 2 additions & 0 deletions

```diff
@@ -512,6 +512,8 @@ def _placeholder_str(self, modality: ModalityStr,
             return "<|fim_prefix|><|img|><|fim_suffix|>"
         if model_type == "gemma3":
             return "<start_of_image>"
+        if model_type == "kimi_vl":
+            return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"  # noqa: E501

         raise TypeError(f"Unknown {modality} model type: {model_type}")
     elif modality == "audio":
```
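This placeholder is what the OpenAI-compatible chat endpoint substitutes for each image part in a Kimi-VL request. An end-to-end sketch follows, assuming a server started with `vllm serve moonshotai/Kimi-VL-A3B-Instruct --trust-remote-code` on the default port; the base URL and image URL are illustrative, not from this commit.

```python
# Hedged sketch (not part of this commit): image chat against a local
# vLLM OpenAI-compatible server. base_url and the image URL are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="moonshotai/Kimi-VL-A3B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "https://example.com/duck.jpg"}},
            {"type": "text", "text": "What is in this image?"},
        ],
    }],
)
print(resp.choices[0].message.content)
```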
