Skip to content

Commit 8693e47

Browse files
[Bugfix] Fix mm_hashes forgetting to be passed (#15668)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent cec8c7d commit 8693e47

File tree

6 files changed: 15 additions (+) and 10 deletions (−)

vllm/inputs/preprocess.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,7 @@ def _separate_enc_dec_inputs_from_mm_processor_outputs(
528528
prompt_token_ids=decoder_inputs_to_override[
529529
"prompt_token_ids"],
530530
mm_kwargs=inputs["mm_kwargs"],
531+
mm_hashes=inputs["mm_hashes"],
531532
mm_placeholders=inputs["mm_placeholders"],
532533
)
533534
else:
@@ -536,6 +537,7 @@ def _separate_enc_dec_inputs_from_mm_processor_outputs(
536537
prompt=inputs["prompt"],
537538
prompt_token_ids=inputs["prompt_token_ids"],
538539
mm_kwargs=inputs["mm_kwargs"],
540+
mm_hashes=inputs["mm_hashes"],
539541
mm_placeholders=inputs["mm_placeholders"],
540542
)
541543
elif inputs["type"] == "token":

vllm/model_executor/models/llava.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -868,6 +868,7 @@ def apply(
868868
mm_items = self._to_mm_items(mm_data)
869869
mm_item_counts = mm_items.get_all_counts()
870870
mm_kwargs = result["mm_kwargs"]
871+
mm_hashes = result["mm_hashes"]
871872

872873
# We reimplement the functionality of MLlavaProcessor from
873874
# https://github.com/TIGER-AI-Lab/Mantis.git
@@ -916,6 +917,7 @@ def get_replacement_mantis(item_idx: int):
916917
prompt=prompt,
917918
prompt_token_ids=prompt_ids,
918919
mm_kwargs=mm_kwargs,
920+
mm_hashes=mm_hashes,
919921
mm_placeholders=mm_placeholder_ranges,
920922
)
921923

vllm/model_executor/models/mllama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1378,7 +1378,7 @@ def forward(
13781378
# Because attn_metadata.encoder_seq_lens only counts the last
13791379
# group of images for each sample, which is used to cheat the
13801380
# block manager to allocate blocks for those images only.
1381-
# See input_processor_for_mllama() for more details.
1381+
# See MllamaMultiModalProcessor for more details.
13821382
num_tiles_tensor = kwargs.pop("num_tiles")
13831383
num_tiles = [t.tolist() for t in num_tiles_tensor]
13841384
num_tokens_per_tile = calc_token_per_chunk(self.image_size)

vllm/model_executor/models/phi4mm.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from vllm.model_executor.models.module_mapping import MultiModelKeys
2929
from vllm.model_executor.sampling_metadata import SamplingMetadata
3030
from vllm.multimodal import MULTIMODAL_REGISTRY
31-
from vllm.multimodal.inputs import MultiModalInputs, NestedTensors
31+
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
3232
from vllm.sequence import IntermediateTensors, SequenceData
3333
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
3434

@@ -1319,9 +1319,9 @@ def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int,
13191319

13201320

13211321
def input_mapper_for_phi4mm_audio(ctx: InputContext,
1322-
data: object) -> MultiModalInputs:
1322+
data: object) -> MultiModalKwargs:
13231323
"""
1324-
This function is used to create the MultiModalInputs for the Phi4MM
1324+
This function is used to create the MultiModalKwargs for the Phi4MM
13251325
(audio) model.
13261326
Specifically, for audio, we extract the audio features from the sound
13271327
file and create pairs of audio features and audio embed lengths (the
@@ -1338,13 +1338,13 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
13381338
data (object): Audio data.
13391339
13401340
Returns:
1341-
MultiModalInputs: Multi-modal inputs.
1341+
MultiModalKwargs: Multi-modal inputs.
13421342
"""
13431343
if not isinstance(data, list):
13441344
data = [data]
13451345

13461346
if len(data) == 0:
1347-
return MultiModalInputs()
1347+
return MultiModalKwargs()
13481348

13491349
audio_features = []
13501350
for audio_input in data:
@@ -1365,15 +1365,15 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
13651365
[single_audio_embed_size],
13661366
)
13671367
audio_features.append(single_audio_feature_audio_len_pair)
1368-
return MultiModalInputs({"audio_features": audio_features})
1368+
return MultiModalKwargs({"audio_features": audio_features})
13691369

13701370

13711371
def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
13721372
if not isinstance(data, list):
13731373
data = [data]
13741374
# data: list of PIL images
13751375
if len(data) == 0:
1376-
return MultiModalInputs()
1376+
return MultiModalKwargs()
13771377
hf_config = ctx.get_hf_config()
13781378
vision_encoder_name = hf_config.img_processor
13791379
if vision_encoder_name is None:
@@ -1385,7 +1385,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
13851385

13861386
image_input_dict = preprocess(data, dynamic_hd_size, vit_image_size,
13871387
vit_patch_size)
1388-
return MultiModalInputs({
1388+
return MultiModalKwargs({
13891389
"pixel_values":
13901390
image_input_dict["pixel_values"],
13911391
"image_sizes":

vllm/model_executor/models/prithvi_geospatial_mae.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def apply(
105105
prompt=prompt,
106106
prompt_token_ids=[1],
107107
mm_kwargs=MultiModalKwargs(mm_kwargs),
108+
mm_hashes=None,
108109
mm_placeholders={},
109110
)
110111

vllm/multimodal/inputs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,7 @@ class MultiModalInputs(TypedDict):
743743
mm_kwargs: MultiModalKwargs
744744
"""Keyword arguments to be directly passed to the model after batching."""
745745

746-
mm_hashes: NotRequired[Optional["MultiModalHashDict"]]
746+
mm_hashes: Optional["MultiModalHashDict"]
747747
"""The hashes of the multi-modal data."""
748748

749749
mm_placeholders: MultiModalPlaceholderDict

Comments (0) — this commit has no comments.