19 | 19 | from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
20 | 20 | from vllm.model_executor.model_loader.utils import set_default_torch_dtype
21 | 21 | from vllm.multimodal import MULTIMODAL_REGISTRY
22 |    | -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
23 |    | -                                    NestedTensors)
   | 22 | +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
   | 23 | +                                    MultiModalKwargs, NestedTensors)
24 | 24 | from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
25 | 25 |                                    ImageSize, MultiModalDataItems)
26 | 26 | from vllm.multimodal.processing import (BaseMultiModalProcessor,
27 | 27 |                                         BaseProcessingInfo, PromptReplacement,
28 | 28 |                                         PromptUpdate)
29 |    | -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
   | 29 | +from vllm.multimodal.profiling import BaseDummyInputsBuilder
30 | 30 | from vllm.sequence import IntermediateTensors
31 | 31 | from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
32 | 32 |                                                            MlpProjectorConfig,
@@ -172,29 +172,30 @@ def get_image_size_with_most_features(self) -> ImageSize:
172 | 172 | class DeepseekVL2DummyInputsBuilder(
173 | 173 |         BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
174 | 174 | 
175 |     | -    def get_dummy_processor_inputs(
    | 175 | +    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    | 176 | +        num_images = mm_counts.get("image", 0)
    | 177 | +
    | 178 | +        processor = self.info.get_hf_processor()
    | 179 | +        image_token = processor.image_token
    | 180 | +
    | 181 | +        return image_token * num_images
    | 182 | +
    | 183 | +    def get_dummy_mm_data(
176 | 184 |         self,
177 | 185 |         seq_len: int,
178 | 186 |         mm_counts: Mapping[str, int],
179 |     | -    ) -> ProcessorInputs:
    | 187 | +    ) -> MultiModalDataDict:
180 | 188 |         num_images = mm_counts.get("image", 0)
181 |     | -        hf_processor = self.info.get_hf_processor()
182 |     | -        image_token: str = hf_processor.image_token
183 | 189 | 
184 | 190 |         max_image_size = self.info.get_image_size_with_most_features()
185 | 191 | 
186 |     | -        mm_data = {
    | 192 | +        return {
187 | 193 |             "image":
188 | 194 |             self._get_dummy_images(width=max_image_size.width,
189 | 195 |                                    height=max_image_size.height,
190 | 196 |                                    num_images=num_images)
191 | 197 |         }
192 | 198 | 
193 |     | -        return ProcessorInputs(
194 |     | -            prompt_text=image_token * num_images,
195 |     | -            mm_data=mm_data,
196 |     | -        )
197 |     | -
198 | 199 | 
199 | 200 | class DeepseekVL2MultiModalProcessor(
200 | 201 |         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
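For context, this diff replaces the single `get_dummy_processor_inputs` hook with two narrower ones: `get_dummy_text` builds the placeholder prompt and `get_dummy_mm_data` builds the multimodal data dict. Below is a minimal sketch of how another dummy-inputs builder could adopt the same split; the `MyProcessingInfo` type, the `<image>` token, and the fixed 384x384 size are hypothetical, and the base class is assumed to combine the two results when building profiling inputs.

```python
from collections.abc import Mapping

from vllm.multimodal.inputs import MultiModalDataDict
from vllm.multimodal.profiling import BaseDummyInputsBuilder


class MyDummyInputsBuilder(BaseDummyInputsBuilder["MyProcessingInfo"]):
    """Hypothetical builder illustrating the split dummy-input hooks."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One placeholder token per requested image; "<image>" is an
        # assumed token for this hypothetical model.
        num_images = mm_counts.get("image", 0)
        return "<image>" * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)
        # _get_dummy_images comes from BaseDummyInputsBuilder (used the same
        # way in the diff above); 384x384 is an arbitrary example size.
        return {
            "image":
            self._get_dummy_images(width=384,
                                   height=384,
                                   num_images=num_images)
        }
```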