7 changes: 7 additions & 0 deletions docs/source/models/supported_models.md
@@ -855,6 +855,13 @@ See [this page](#generative-models) for more information on how to use generativ
  *
  * ✅︎
  * ✅︎
- * `KimiVLForConditionalGeneration`
  * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking
  * T + I<sup>+</sup>
  * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`
  *
  *
  * ✅︎
- * `Llama4ForConditionalGeneration`
  * Llama-4-17B-Omni-Instruct
  * T + I<sup>+</sup>
24 changes: 24 additions & 0 deletions examples/offline_inference/vision_language.py
@@ -363,6 +363,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    )


# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
        f"<|media_pad|><|media_end|>{question}<|im_end|>"
        "<|im_assistant|>assistant<|im_middle|>" for question in questions
    ]

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        trust_remote_code=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -962,6 +985,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3,
"internvl_chat": run_internvl,
"kimi_vl": run_kimi_vl,
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
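For reviewers who want to try the new runner outside the example script's driver, here is a minimal offline sketch using the same prompt format as `run_kimi_vl` above. It assumes vLLM's standard `LLM`/`SamplingParams` API; the local image path `example.jpg` and the sampling settings are placeholders, not part of this PR.

```python
# Minimal Kimi-VL offline-inference sketch (hypothetical local image path below).
from PIL import Image

from vllm import LLM, SamplingParams

question = "What is in this image?"
prompt = (
    "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
    f"<|media_pad|><|media_end|>{question}<|im_end|>"
    "<|im_assistant|>assistant<|im_middle|>"
)

llm = LLM(
    model="moonshotai/Kimi-VL-A3B-Instruct",
    max_model_len=4096,
    trust_remote_code=True,
)
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("example.jpg")}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```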
40 changes: 40 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -317,6 +317,45 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    )


def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=4,
        tensor_parallel_size=1,
        limit_mm_per_prompt={"image": len(image_urls)},
        trust_remote_code=True,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [{
        "role": "user",
        "content": [
            *placeholders,
            {"type": "text", "text": question},
        ],
    }]

    processor = AutoProcessor.from_pretrained(model_name,
                                              trust_remote_code=True)

    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

Expand Down Expand Up @@ -631,6 +670,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"kimi_vl": load_kimi_vl,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama,
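The example script's shared driver consumes the `ModelRequestData` returned by `load_kimi_vl`; a condensed sketch of that flow is shown below. The two image URLs are hypothetical placeholders and the sampling settings are arbitrary.

```python
# Condensed sketch of how the driver runs a multi-image Kimi-VL request.
from dataclasses import asdict

from vllm import LLM, SamplingParams

image_urls = [
    "https://example.com/a.jpg",  # hypothetical URL
    "https://example.com/b.jpg",  # hypothetical URL
]
req_data = load_kimi_vl("What are the differences between these images?",
                        image_urls)

llm = LLM(**asdict(req_data.engine_args))
outputs = llm.generate(
    {
        "prompt": req_data.prompt,
        "multi_modal_data": {"image": req_data.image_data},
    },
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```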
1 change: 1 addition & 0 deletions requirements/test.in
@@ -10,6 +10,7 @@ pytest-timeout
# testing utils
awscli
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
10 changes: 9 additions & 1 deletion requirements/test.txt
@@ -39,6 +39,8 @@ bitsandbytes==0.45.3
# via -r requirements/test.in
black==24.10.0
# via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
boto3==1.35.57
# via tensorizer
botocore==1.35.57
@@ -127,6 +129,7 @@ fastsafetensors==0.1.10
# via -r requirements/test.in
filelock==3.16.1
# via
# blobfile
# datasets
# huggingface-hub
# ray
@@ -227,7 +230,9 @@ llvmlite==0.44.0
lm-eval==0.4.8
# via -r requirements/test.in
lxml==5.3.0
# via sacrebleu
# via
# blobfile
# sacrebleu
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.2
@@ -426,6 +431,8 @@ pybind11==2.13.6
# via lm-eval
pycparser==2.22
# via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.9.2
# via
# datamodel-code-generator
@@ -689,6 +696,7 @@ tzdata==2024.2
# via pandas
urllib3==2.2.3
# via
# blobfile
# botocore
# requests
# responses
11 changes: 11 additions & 0 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -318,6 +318,17 @@
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
max_model_len=8192,
max_num_seqs=2,
dtype="bfloat16",
tensor_parallel_size=1,
vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
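As a rough illustration of how the two lambdas above combine, the sketch below reproduces the single-image prompt the test harness is expected to build (the real composition lives in the shared VLM test builders; the question text is arbitrary).

```python
# Illustrative composition: image placeholder + question, wrapped by the
# chat-style prompt formatter defined for kimi_vl above.
def prompt_formatter(img_prompt: str) -> str:
    return (f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|>"
            "<|im_assistant|>assistant<|im_middle|>")


def img_idx_to_prompt(idx: int) -> str:
    return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"


question = "What is the season?"
print(prompt_formatter(img_idx_to_prompt(1) + question))
# <|im_user|>user<|im_middle|><|media_start|>image<|media_content|><|media_pad|><|media_end|>What is the season?<|im_end|><|im_assistant|>assistant<|im_middle|>
```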
@@ -68,6 +68,17 @@ def qwen2_vllm_to_hf_output(
    return output_ids, hf_output_str, out_logprobs


def kimiv_vl_vllm_to_hf_output(
        vllm_output: RunnerOutput,
        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

    hf_output_str = output_str + "<|im_end|>[EOS]"

    return output_ids, hf_output_str, out_logprobs


def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
                                  model: str) -> RunnerOutput:
    config = AutoConfig.from_pretrained(model)
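For context, post-processing hooks like this one are applied to each vLLM output tuple before it is compared with the HF reference. A call with hypothetical values would look roughly like:

```python
# Hypothetical values, shown only to illustrate the transformation: the vLLM
# string gains the "<|im_end|>[EOS]" suffix that the HF runner emits.
vllm_output = ([101, 2023, 102], "The image shows a snowy street.", None)
ids, hf_comparable_str, logprobs = kimiv_vl_vllm_to_hf_output(
    vllm_output, "moonshotai/Kimi-VL-A3B-Instruct")
assert hf_comparable_str == "The image shows a snowy street.<|im_end|>[EOS]"
```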
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -258,6 +258,7 @@ def _test_processing_correctness_mistral(
"OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
3 changes: 3 additions & 0 deletions tests/models/registry.py
@@ -302,6 +302,9 @@ def check_available_online(
                                          trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                                      extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
                                                      trust_remote_code=True),
    "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
                                                      min_transformers_version="4.51"),
    "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
2 changes: 2 additions & 0 deletions vllm/entrypoints/chat_utils.py
@@ -510,6 +510,8 @@ def _placeholder_str(self, modality: ModalityStr,
return "<|fim_prefix|><|img|><|fim_suffix|>"
if model_type == "gemma3":
return "<start_of_image>"
if model_type == "kimi_vl":
return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>" # noqa: E501

raise TypeError(f"Unknown {modality} model type: {model_type}")
elif modality == "audio":
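With this placeholder registered, image parts in OpenAI-compatible chat requests should be expanded into Kimi-VL's media tokens automatically. A rough end-to-end sketch, assuming a server started with `vllm serve moonshotai/Kimi-VL-A3B-Instruct --trust-remote-code`, the standard `openai` client, and a placeholder image URL:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="moonshotai/Kimi-VL-A3B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cat.jpg"}},  # hypothetical URL
            {"type": "text", "text": "What is in this image?"},
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)
```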