
Commit 56d4aef

[VLM] Avoid unnecessary dummy multimodal data during processing (#16416)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent: dd143ef

33 files changed (+434, -392 lines)
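
Note: the same refactor repeats across every file in this commit. The per-model
get_dummy_processor_inputs() override is split into two hooks, get_dummy_text()
and get_dummy_mm_data(), so text-only profiling no longer has to materialize the
dummy images. Below is a minimal sketch of the new pattern for a hypothetical
model; MyModelProcessingInfo, MyModelDummyInputsBuilder, the "<image>" token and
the 336x336 size are placeholders for illustration, not part of this commit:

from collections.abc import Mapping

from vllm.multimodal.inputs import MultiModalDataDict
from vllm.multimodal.processing import BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder


class MyModelProcessingInfo(BaseProcessingInfo):
    """Placeholder; a real model exposes its HF config/processor here."""


class MyModelDummyInputsBuilder(BaseDummyInputsBuilder[MyModelProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # Cheap text-only placeholder: one image token per dummy image.
        num_images = mm_counts.get("image", 0)
        return "<image>" * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        # Heavy dummy data is built only when multimodal inputs are needed.
        num_images = mm_counts.get("image", 0)
        return {
            "image":
            self._get_dummy_images(width=336,
                                   height=336,
                                   num_images=num_images)
        }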

vllm/model_executor/models/aria.py

Lines changed: 14 additions & 13 deletions
@@ -21,12 +21,13 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 # yapf: disable
@@ -415,31 +416,31 @@ def get_num_image_tokens(self) -> int:

 class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token: str = processor.tokenizer.image_token  # type: ignore
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         vision_config = self.info.get_vision_config()

         max_image_size = vision_config.image_size
         num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=max_image_size,
                                    height=max_image_size,
                                    num_images=num_images)
         }

-        hf_processor = self.info.get_hf_processor()
-        image_token: str = hf_processor.tokenizer.image_token  # type: ignore
-
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )
-

 class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):

vllm/model_executor/models/aya_vision.py

Lines changed: 13 additions & 12 deletions
@@ -20,15 +20,15 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo,
                                         MultiModalFieldConfig,
                                         PromptReplacement, PromptUpdate,
                                         PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -146,28 +146,29 @@ def get_num_patches(self, *, image_width: int, image_height: int,
 class AyaVisionDummyInputsBuilder(
         BaseDummyInputsBuilder[AyaVisionProcessingInfo]):

-    def get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
         processor = self.info.get_hf_processor()
         image_token = processor.image_token

+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         image_size = \
             self.info.get_image_size_with_most_features()

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=image_size.width,
                                    height=image_size.height,
                                    num_images=num_images)
         }
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )


 class AyaVisionMultiModalProcessor(

vllm/model_executor/models/blip2.py

Lines changed: 9 additions & 10 deletions
@@ -15,12 +15,13 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptIndexTargets,
                                         PromptInsertion, PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .blip import BlipVisionModel
@@ -413,29 +414,27 @@ def get_num_image_tokens(self) -> int:

 class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config

         max_image_size = vision_config.image_size
         num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=max_image_size,
                                    height=max_image_size,
                                    num_images=num_images)
         }

-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-

 class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):

vllm/model_executor/models/chameleon.py

Lines changed: 14 additions & 10 deletions
@@ -30,12 +30,13 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate, PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
@@ -72,28 +73,31 @@ def get_num_image_tokens(self) -> int:
 class ChameleonDummyInputsBuilder(
         BaseDummyInputsBuilder[ChameleonProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         config = self.info.get_hf_config()

         width = height = config.vq_config.resolution
         num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=width,
                                    height=height,
                                    num_images=num_images)
         }

-        return ProcessorInputs(
-            prompt_text="<image>" * num_images,
-            mm_data=mm_data,
-        )
-

 class ChameleonMultiModalProcessor(
         BaseMultiModalProcessor[ChameleonProcessingInfo]):

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 14 additions & 13 deletions
@@ -19,14 +19,14 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
                                                           MlpProjectorConfig,
@@ -172,29 +172,30 @@ def get_image_size_with_most_features(self) -> ImageSize:
 class DeepseekVL2DummyInputsBuilder(
         BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
-        hf_processor = self.info.get_hf_processor()
-        image_token: str = hf_processor.image_token

         max_image_size = self.info.get_image_size_with_most_features()

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=max_image_size.width,
                                    height=max_image_size.height,
                                    num_images=num_images)
         }

-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )
-

 class DeepseekVL2MultiModalProcessor(
         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):

vllm/model_executor/models/florence2.py

Lines changed: 10 additions & 11 deletions
@@ -21,13 +21,14 @@
                                               BartScaledWordEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
-from vllm.multimodal.parse import MultiModalDataDict, MultiModalDataItems
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseProcessingInfo,
                                         EncDecMultiModalProcessor,
                                         PromptIndexTargets, PromptInsertion,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
@@ -772,27 +773,25 @@ def get_num_image_tokens(self) -> int:
 class Florence2DummyInputsBuilder(
         BaseDummyInputsBuilder[Florence2ProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)

         target_width = target_height = self.info.get_hf_config().projection_dim

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
                                    num_images=num_images)
         }

-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-

 class Florence2MultiModalProcessor(
         EncDecMultiModalProcessor[Florence2ProcessingInfo]):

vllm/model_executor/models/fuyu.py

Lines changed: 9 additions & 10 deletions
@@ -31,13 +31,14 @@
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate, PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -125,27 +126,25 @@ def get_image_size_with_most_features(self) -> ImageSize:

 class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
                                    num_images=num_images)
         }

-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-

 class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):

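The per-model ProcessorInputs construction is removed from every builder above,
which implies the shared BaseDummyInputsBuilder now assembles it from the two new
hooks. That combining step is not part of the hunks shown here; the sketch below is
only an assumption about its likely shape in vllm/multimodal/profiling.py:

    # Assumed shared logic (not shown in this diff): combine the cheap text
    # placeholder with the dummy multimodal data only when both are required.
    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        dummy_text = self.get_dummy_text(mm_counts)
        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
        return ProcessorInputs(prompt_text=dummy_text, mm_data=dummy_mm_data)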