28
28
from vllm .model_executor .models .module_mapping import MultiModelKeys
29
29
from vllm .model_executor .sampling_metadata import SamplingMetadata
30
30
from vllm .multimodal import MULTIMODAL_REGISTRY
31
- from vllm .multimodal .inputs import MultiModalInputs , NestedTensors
31
+ from vllm .multimodal .inputs import MultiModalKwargs , NestedTensors
32
32
from vllm .sequence import IntermediateTensors , SequenceData
33
33
from vllm .transformers_utils .tokenizer import cached_tokenizer_from_config
34
34
@@ -1319,9 +1319,9 @@ def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int,
1319
1319
1320
1320
1321
1321
def input_mapper_for_phi4mm_audio (ctx : InputContext ,
1322
- data : object ) -> MultiModalInputs :
1322
+ data : object ) -> MultiModalKwargs :
1323
1323
"""
1324
- This function is used to create the MultiModalInputs for the Phi4MM
1324
+ This function is used to create the MultiModalKwargs for the Phi4MM
1325
1325
(audio) model.
1326
1326
Specifically, for audio, we extract the audio features from the sound
1327
1327
file and create pairs of audio features and audio embed lengths (the
@@ -1338,13 +1338,13 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
1338
1338
data (object): Audio data.
1339
1339
1340
1340
Returns:
1341
- MultiModalInputs : Multi-modal inputs.
1341
+ MultiModalKwargs : Multi-modal inputs.
1342
1342
"""
1343
1343
if not isinstance (data , list ):
1344
1344
data = [data ]
1345
1345
1346
1346
if len (data ) == 0 :
1347
- return MultiModalInputs ()
1347
+ return MultiModalKwargs ()
1348
1348
1349
1349
audio_features = []
1350
1350
for audio_input in data :
@@ -1365,15 +1365,15 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
1365
1365
[single_audio_embed_size ],
1366
1366
)
1367
1367
audio_features .append (single_audio_feature_audio_len_pair )
1368
- return MultiModalInputs ({"audio_features" : audio_features })
1368
+ return MultiModalKwargs ({"audio_features" : audio_features })
1369
1369
1370
1370
1371
1371
def input_mapper_for_phi4mm_image (ctx : InputContext , data : object ):
1372
1372
if not isinstance (data , list ):
1373
1373
data = [data ]
1374
1374
# data: list of PIL images
1375
1375
if len (data ) == 0 :
1376
- return MultiModalInputs ()
1376
+ return MultiModalKwargs ()
1377
1377
hf_config = ctx .get_hf_config ()
1378
1378
vision_encoder_name = hf_config .img_processor
1379
1379
if vision_encoder_name is None :
@@ -1385,7 +1385,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
1385
1385
1386
1386
image_input_dict = preprocess (data , dynamic_hd_size , vit_image_size ,
1387
1387
vit_patch_size )
1388
- return MultiModalInputs ({
1388
+ return MultiModalKwargs ({
1389
1389
"pixel_values" :
1390
1390
image_input_dict ["pixel_values" ],
1391
1391
"image_sizes" :
0 commit comments