@@ -6,7 +6,7 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict, deque
 from collections.abc import Awaitable, Iterable
-from functools import cache, lru_cache, partial
+from functools import cached_property, lru_cache, partial
 from pathlib import Path
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)
@@ -37,6 +37,8 @@
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.model_executor.model_loader import get_model_cls
+from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 # yapf: disable
@@ -492,6 +494,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
     def model_config(self) -> ModelConfig:
         return self._model_config
 
+    @cached_property
+    def model_cls(self):
+        return get_model_cls(self.model_config)
+
     @property
     def allowed_local_media_path(self):
         return self._model_config.allowed_local_media_path
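
Note on the `cached_property` choice above: `get_model_cls` resolves the model class from the `ModelConfig`, which can involve a registry lookup and a module import, so caching the result on the tracker instance means the resolution happens at most once rather than on every `add()` call. A minimal standalone sketch of that caching behavior follows; `resolve_model_cls` is a hypothetical stand-in for `get_model_cls`.

```python
# Minimal sketch of the @cached_property pattern used in the hunk above.
# resolve_model_cls() is hypothetical, standing in for get_model_cls().
from functools import cached_property


def resolve_model_cls(name: str) -> type:
    print(f"resolving {name} ...")  # pretend this is an expensive lookup
    return type(name, (), {})


class Tracker:
    def __init__(self, model_name: str) -> None:
        self.model_name = model_name

    @cached_property
    def model_cls(self) -> type:
        # Computed on first access, then stored in the instance __dict__;
        # later accesses return the cached class without re-resolving.
        return resolve_model_cls(self.model_name)


tracker = Tracker("LlavaForConditionalGeneration")
assert tracker.model_cls is tracker.model_cls  # resolved exactly once
```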
@@ -500,96 +506,14 @@ def allowed_local_media_path(self):
     def mm_registry(self):
         return MULTIMODAL_REGISTRY
 
-    @staticmethod
-    @cache
-    def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
-        return tokenizer.decode(token_index)
-
-    def _placeholder_str(self, modality: ModalityStr,
-                         current_count: int) -> Optional[str]:
-        if modality in self._model_config.mm_placeholder_str_override:
-            return self._model_config.mm_placeholder_str_override[modality]
-
-        # TODO: Let user specify how to insert image tokens into prompt
-        # (similar to chat template)
-        hf_config = self._model_config.hf_config
-        model_type = hf_config.model_type
-
-        if modality in ("image", "image_embeds"):
-            if model_type == "chatglm":
-                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
-            if model_type == "glm4v":
-                return "<|begin_of_image|><|image|><|end_of_image|>"
-            if model_type in ("phi3_v", "phi4mm"):
-                return f"<|image_{current_count}|>"
-            if model_type in ("minicpmo", "minicpmv"):
-                return "(<image>./</image>)"
-            if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
-                              "pixtral", "mistral3"):
-                # These models do not use image tokens in the prompt
-                return None
-            if model_type == "qwen":
-                return f"Picture {current_count}: <img></img>"
-            if model_type.startswith("llava"):
-                return self._cached_token_str(self._tokenizer,
-                                              hf_config.image_token_index)
-
-            if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
-                              "internvl_chat", "ovis", "skywork_chat",
-                              "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
-                return "<image>"
-            if model_type in ("mllama", "llama4"):
-                return "<|image|>"
-            if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
-                return "<|vision_start|><|image_pad|><|vision_end|>"
-            if model_type == "qwen2_5_omni":
-                return "<|vision_start|><|IMAGE|><|vision_end|>"
-            if model_type == "molmo":
-                return ""
-            if model_type == "aria":
-                return "<|fim_prefix|><|img|><|fim_suffix|>"
-            if model_type == "gemma3":
-                return "<start_of_image>"
-            if model_type == "kimi_vl":
-                return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"  # noqa: E501
-
-            raise TypeError(f"Unknown {modality} model type: {model_type}")
-        elif modality == "audio":
-            if model_type in ("ultravox", "granite_speech"):
-                return "<|audio|>"
-            if model_type == "phi4mm":
-                return f"<|audio_{current_count}|>"
-            if model_type in ("qwen2_audio", "qwen2_5_omni"):
-                return (f"Audio {current_count}: "
-                        f"<|audio_bos|><|AUDIO|><|audio_eos|>")
-            if model_type == "minicpmo":
-                return "(<audio>./</audio>)"
-            raise TypeError(f"Unknown model type: {model_type}")
-        elif modality == "video":
-            if model_type == "internvl_chat":
-                return "<video>"
-            if model_type == "glm4v":
-                return "<|begin_of_video|><|video|><|end_of_video|>"
-            if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
-                return "<|vision_start|><|video_pad|><|vision_end|>"
-            if model_type == "qwen2_5_omni":
-                return "<|vision_start|><|VIDEO|><|vision_end|>"
-            if model_type in ("minicpmo", "minicpmv"):
-                return "(<video>./</video>)"
-            if model_type.startswith("llava"):
-                return self._cached_token_str(self._tokenizer,
-                                              hf_config.video_token_index)
-            raise TypeError(f"Unknown {modality} model type: {model_type}")
-        else:
-            raise TypeError(f"Unknown modality: {modality}")
-
     def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
         """
         Add a multi-modal item to the current prompt and return the
         placeholder string to use, if any.
         """
         mm_registry = self.mm_registry
         model_config = self.model_config
+        model_cls = cast(SupportsMultiModal, self.model_cls)
 
         input_modality = modality.replace("_embeds", "")
 
@@ -614,7 +538,7 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
 
         self._items_by_modality[modality].append(item)
 
-        return self._placeholder_str(modality, current_count)
+        return model_cls.get_placeholder_str(modality, current_count)
 
     @abstractmethod
     def create_parser(self) -> "BaseMultiModalContentParser":
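
To make the new call site concrete: below is a hedged sketch of the model-side hook it relies on. Only `SupportsMultiModal` and the call shape `model_cls.get_placeholder_str(modality, current_count)` come from this diff; the classmethod signature is inferred from that call, and the model class itself is hypothetical.

```python
# Hypothetical SupportsMultiModal implementer, inferred from the call
# site model_cls.get_placeholder_str(modality, current_count) above;
# not copied from vLLM source.
from typing import Optional


class ExamplePhiStyleModel:
    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        # Reproduces one branch of the removed _placeholder_str ladder:
        # indexed placeholders in the phi4mm style.
        if modality in ("image", "image_embeds"):
            return f"<|image_{i}|>"
        if modality == "audio":
            return f"<|audio_{i}|>"
        raise ValueError(f"Unsupported modality: {modality}")
```

The design consequence of the refactor is that the per-model `if model_type == ...` ladder leaves the shared tracker: supporting a new multimodal model means implementing this hook next to the model definition instead of editing chat utils.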