diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 4f9b551eedd4..5da2217c18d4 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -33,6 +33,7 @@
 from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
 from vllm.worker.hpu_model_runner import VisionBuckets
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+from .utils import merge_multimodal_embeddings_static
 
 logger = init_logger(__name__)
 
@@ -598,6 +599,21 @@ def get_multimodal_embeddings(
         return tuple(embeddings) if embeddings else None
 
+    def get_input_embeddings_hpu(
+        self,
+        input_ids: torch.Tensor,
+        image_index_tensor: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.llm.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings_static(
+                image_index_tensor,
+                inputs_embeds,
+                multimodal_embeddings,
+            )
+        return inputs_embeds
+
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 2c8786ed255c..c4300923a9d9 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1802,8 +1802,9 @@ def _prepare_prompt(
                                                dtype=torch.long,
                                                flat=self.use_merged_prefill)
 
-            if seq_group_metadata.multi_modal_data and self.is_mm_optimized and \
-                    'InternVLChatModel' in str(type(self.model.model)):
+            if (seq_group_metadata.multi_modal_data and self.is_mm_optimized
+                    and ('InternVLChatModel' in str(type(self.model.model))
+                         or 'Ovis2_5' in str(type(self.model.model)))):
                 is_image_flatten = (
                     input_tokens_tensor == self.image_token_id).flatten()
                 image_index_tensor = is_image_flatten.nonzero().squeeze(-1)
@@ -2769,7 +2770,7 @@ def create_dummy_multi_modal_seq_group_metadata(self, group_id, img_args,
         elif "Ovis2_5" in str(type(self.model.model)):
             vit_cfg = self.model.model.config.vit_config
             self.image_token_id = getattr(self.model.model.config,
-                                          "image_token_id", -200)
+                                          "image_token_id", 151667)
             image_h = 128
             image_w = int(img_args / image_h)
             num_image_tokens = int(image_h * image_w //
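
For context, the patch routes the HPU path through merge_multimodal_embeddings_static with an image_index_tensor that the runner precomputes in _prepare_prompt (via nonzero() on the host). A minimal sketch of what such an index-based merge can look like is below; this is an illustration under assumptions, not the actual implementation in vllm/model_executor/models/utils.py, and the sketch function name is hypothetical.

    import torch

    def merge_multimodal_embeddings_static_sketch(
            image_index_tensor: torch.Tensor,   # 1-D flat positions of image tokens
            inputs_embeds: torch.Tensor,        # text embeddings, (..., hidden)
            multimodal_embeddings) -> torch.Tensor:
        # Concatenate per-image chunks into one (num_image_tokens, hidden)
        # tensor so a single scatter covers every placeholder position.
        if isinstance(multimodal_embeddings, (list, tuple)):
            multimodal_embeddings = torch.cat(list(multimodal_embeddings), dim=0)
        hidden = inputs_embeds.shape[-1]
        flat = inputs_embeds.view(-1, hidden)
        # Overwrite placeholder rows with the vision embeddings at the
        # precomputed indices; the view shares storage, so this updates
        # inputs_embeds in place.
        flat.index_copy_(0, image_index_tensor,
                         multimodal_embeddings.to(flat.dtype).view(-1, hidden))
        return flat.view_as(inputs_embeds)

Precomputing the index tensor outside the model keeps dynamic-shape ops like nonzero() and boolean masking out of the forward pass, which is what makes the merge shape-static for HPU graph compilation; that is presumably why _prepare_prompt builds image_index_tensor for Ovis2_5 the same way it already does for InternVLChatModel.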