Commit fbd07c5

ywang96 authored and nopperl committed
[Multimodal] Generate mm_hash based on request metadata when caching is turned off (vllm-project#23690)
Signed-off-by: Roger Wang <[email protected]>
1 parent 24a7011 commit fbd07c5

File tree

12 files changed: +179 −24 lines


vllm/inputs/preprocess.py

Lines changed: 63 additions & 8 deletions
@@ -257,6 +257,8 @@ def _process_multimodal(
         mm_processor_kwargs: Optional[Mapping[str, object]],
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -273,10 +275,13 @@ def _process_multimodal(
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
-        return mm_processor.apply(prompt,
-                                  mm_data,
-                                  hf_processor_mm_kwargs=mm_processor_kwargs,
-                                  tokenization_kwargs=tokenization_kwargs)
+        return mm_processor.apply(
+            prompt,
+            mm_data,
+            hf_processor_mm_kwargs=mm_processor_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
+            mm_hash_overrides=mm_hash_overrides,
+        )
 
     async def _process_multimodal_async(
         self,
@@ -285,6 +290,8 @@ async def _process_multimodal_async(
         mm_processor_kwargs: Optional[Mapping[str, object]],
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> MultiModalInputs:
         """
         Async version of
@@ -301,10 +308,13 @@ async def _process_multimodal_async(
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
-        return mm_processor.apply(prompt,
-                                  mm_data,
-                                  hf_processor_mm_kwargs=mm_processor_kwargs,
-                                  tokenization_kwargs=tokenization_kwargs)
+        return mm_processor.apply(
+            prompt,
+            mm_data,
+            hf_processor_mm_kwargs=mm_processor_kwargs,
+            tokenization_kwargs=tokenization_kwargs,
+            mm_hash_overrides=mm_hash_overrides,
+        )
 
     def _process_embeds(
         self,
@@ -341,6 +351,8 @@ def _process_tokens(
         parsed_content: TokensPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_token_ids = parsed_content["prompt_token_ids"]
         token_type_ids = parsed_content.get("token_type_ids")
@@ -353,6 +365,7 @@ def _process_tokens(
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         else:
             inputs = token_inputs(
@@ -370,6 +383,8 @@ async def _process_tokens_async(
         parsed_content: TokensPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_token_ids = parsed_content["prompt_token_ids"]
         token_type_ids = parsed_content.get("token_type_ids")
@@ -382,6 +397,7 @@ async def _process_tokens_async(
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         else:
             inputs = token_inputs(
@@ -399,6 +415,8 @@ def _process_text(
         parsed_content: TextPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_text = parsed_content["prompt"]
 
@@ -410,6 +428,7 @@ def _process_text(
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         else:
             prompt_token_ids = self._tokenize_prompt(
@@ -432,6 +451,8 @@ async def _process_text_async(
         parsed_content: TextPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_text = parsed_content["prompt"]
 
@@ -443,6 +464,7 @@ async def _process_text_async(
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         else:
             prompt_token_ids = await self._tokenize_prompt_async(
@@ -465,6 +487,8 @@ def _prompt_to_llm_inputs(
         prompt: SingletonPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -486,18 +510,21 @@ def _prompt_to_llm_inputs(
             return self._process_tokens(
                 parsed["content"],
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         if parsed["type"] == "text":
             return self._process_text(
                 parsed["content"],
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         if parsed["type"] == "str":
             return self._process_text(
                 TextPrompt(prompt=parsed["content"]),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
         assert_never(parsed)
@@ -507,6 +534,8 @@ async def _prompt_to_llm_inputs_async(
         prompt: SingletonPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> SingletonInputs:
         """
         Async version of
@@ -520,18 +549,21 @@ async def _prompt_to_llm_inputs_async(
             return await self._process_tokens_async(
                 parsed["content"],
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         if parsed["type"] == "text":
             return await self._process_text_async(
                 parsed["content"],
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
         if parsed["type"] == "str":
             return await self._process_text_async(
                 TextPrompt(prompt=parsed["content"]),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
         assert_never(parsed)
@@ -641,6 +673,8 @@ def _process_encoder_decoder_prompt(
         self,
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -682,6 +716,7 @@ def _process_encoder_decoder_prompt(
             encoder_inputs = self._prompt_to_llm_inputs(
                 prompt["encoder_prompt"],
                 tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 decoder_inputs = None
@@ -697,6 +732,7 @@ def _process_encoder_decoder_prompt(
             inputs = self._prompt_to_llm_inputs(
                 prompt,
                 tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
             if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
@@ -712,6 +748,8 @@ async def _process_encoder_decoder_prompt_async(
         self,
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> EncoderDecoderInputs:
         """
         Async version of
@@ -724,6 +762,7 @@ async def _process_encoder_decoder_prompt_async(
             encoder_task = self._prompt_to_llm_inputs_async(
                 prompt["encoder_prompt"],
                 tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
@@ -733,6 +772,7 @@ async def _process_encoder_decoder_prompt_async(
                 decoder_task = self._prompt_to_llm_inputs_async(
                     decoder_input,
                     tokenization_kwargs=tokenization_kwargs,
+                    mm_hash_overrides=mm_hash_overrides,
                 )
 
             encoder_inputs, decoder_inputs = await asyncio.gather(
@@ -748,6 +788,7 @@ async def _process_encoder_decoder_prompt_async(
             inputs = await self._prompt_to_llm_inputs_async(
                 prompt,
                 tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
             if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
@@ -774,6 +815,8 @@ def _process_decoder_only_prompt(
         prompt: SingletonPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -794,6 +837,7 @@ def _process_decoder_only_prompt(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
+            mm_hash_overrides=mm_hash_overrides,
         )
 
         return self._build_decoder_only_llm_inputs(prompt_comps)
@@ -803,6 +847,8 @@ async def _process_decoder_only_prompt_async(
         prompt: SingletonPrompt,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> DecoderOnlyInputs:
         """
         Async version of
@@ -812,6 +858,7 @@ async def _process_decoder_only_prompt_async(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
+            mm_hash_overrides=mm_hash_overrides,
         )
 
         return self._build_decoder_only_llm_inputs(prompt_comps)
@@ -821,6 +868,8 @@ def preprocess(
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
@@ -829,6 +878,7 @@ def preprocess(
             return self._process_encoder_decoder_prompt(
                 prompt,
                 tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
         if is_explicit_encoder_decoder_prompt(prompt):
@@ -840,13 +890,16 @@ def preprocess(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
+            mm_hash_overrides=mm_hash_overrides,
         )
 
     async def preprocess_async(
         self,
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
+        *,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> ProcessorInputs:
         """
         Async version of
@@ -858,6 +911,7 @@ async def preprocess_async(
             return await self._process_encoder_decoder_prompt_async(
                 prompt,
                 tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
         if is_explicit_encoder_decoder_prompt(prompt):
@@ -869,6 +923,7 @@ async def preprocess_async(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
+            mm_hash_overrides=mm_hash_overrides,
         )
 
     def clear_cache(self) -> None:
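
Taken together, these hunks thread the override from the public `preprocess` / `preprocess_async` entry points down to `mm_processor.apply`. A hypothetical call site might look roughly like this; the preprocessor instance, prompt contents, and override values are illustrative and not taken from this commit:

# Hypothetical usage sketch; assumes `preprocessor` is an already-constructed
# InputPreprocessor and `img` is a PIL image loaded elsewhere.
inputs = preprocessor.preprocess(
    {
        "prompt": "<image>\nDescribe the picture.",
        "multi_modal_data": {"image": img},
    },
    mm_hash_overrides={"image": ["req-0-image-0"]},  # one hash per image item
)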

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 3 additions & 0 deletions
@@ -290,6 +290,7 @@ def _cached_apply_hf_processor(
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
@@ -301,13 +302,15 @@ def _cached_apply_hf_processor(
                 mm_data_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
         return super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
+            mm_hash_overrides=mm_hash_overrides,
         )
 
vllm/model_executor/models/h2ovl.py

Lines changed: 3 additions & 0 deletions
@@ -479,6 +479,7 @@ def _cached_apply_hf_processor(
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
@@ -490,13 +491,15 @@ def _cached_apply_hf_processor(
                 mm_data_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
+                mm_hash_overrides=mm_hash_overrides,
             )
 
         return super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
+            mm_hash_overrides=mm_hash_overrides,
         )
 
vllm/model_executor/models/llava.py

Lines changed: 6 additions & 2 deletions
@@ -795,6 +795,7 @@ def apply(
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -805,8 +806,11 @@ def apply(
             image_height=-1,
         )
 
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
-                               tokenization_kwargs)
+        result = super().apply(prompt,
+                               mm_data,
+                               hf_processor_mm_kwargs,
+                               tokenization_kwargs,
+                               mm_hash_overrides=mm_hash_overrides)
 
         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()