                                         BaseProcessingInfo)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP, SupportsQuant)
@@ -217,9 +216,6 @@ def wrapper(*args, **kwargs):

 class MultiModalProcessingInfo(BaseProcessingInfo):

-    def get_hf_config(self):
-        return self.ctx.model_config.hf_config
-
     def get_supported_mm_limits(self):
         return {"image": None}

@@ -784,6 +780,7 @@ def _can_concat(x: list[torch.Tensor]):
     },
     enable_if=can_enable_torch_compile)
 class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal):
+    merge_by_field_config = True
     # Backwards compatibility for prev released models. State dicts back then
     # had different formats and cannot be loaded with `AutoModel` mapping as is
     hf_to_vllm_mapper = WeightsMapper(
@@ -828,40 +825,27 @@ def get_language_model(self) -> torch.nn.Module:
         return self.model

     def get_multimodal_embeddings(self, **kwargs):
-        pixel_values = kwargs.pop("pixel_values", None)
-        pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
-            "image_patches", None)
-        image_embeds = kwargs.pop("image_embeds", None)
+        pixel_values: Optional[torch.Tensor] = kwargs.pop("pixel_values", None)
+        image_embeds: Optional[torch.Tensor] = kwargs.pop("image_embeds", None)
+        # Model might use `image_patches` instead of `pixel_values`
+        if pixel_values is None:
+            pixel_values = kwargs.pop("image_patches", None)

         if image_embeds is not None:
             return image_embeds

-        if pixel_values is None and image_embeds is None:
+        if pixel_values is None:
             return None

         num_image_patches = kwargs.pop("num_image_patches")
         if pixel_values is not None:
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = flatten_bn(pixel_values).to(self.dtype)
-            elif is_list_of(pixel_values, torch.Tensor):
-                pixel_values = flatten_and_concat(pixel_values).to(self.dtype)
-            else:
-                raise ValueError(
-                    f"Unsupported pixel_values type {type(pixel_values)}. "
-                    "Expected `torch.Tensor` or list of `torch.Tensor`.")
-
-            if isinstance(num_image_patches, list):
-                num_image_patches = torch.cat(num_image_patches)
-
             vision_embeddings = self.model.get_image_features(
-                pixel_values,
-                **{
-                    k: v.flatten(0, 1)
-                    for k, v in kwargs.items()
-                },
-            )
+                pixel_values, **kwargs)

             if isinstance(vision_embeddings, torch.Tensor):
+                if isinstance(num_image_patches, list):
+                    num_image_patches = torch.cat(num_image_patches)
+
                 if vision_embeddings.ndim == 2:
                     vision_embeddings = vision_embeddings.unsqueeze(0)

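For context, the `merge_by_field_config = True` flag added above lets vLLM's multimodal input pipeline hand the model per-field tensors that are already merged across the batch, which is why the manual `flatten_bn` / `flatten_and_concat` branches could be dropped from `get_multimodal_embeddings`. The snippet below is only a minimal sketch of that kind of per-field merging; `merge_pixel_values` is a hypothetical helper for illustration, not part of vLLM.

# Illustrative only: mimics the per-image flattening that the deleted
# flatten_bn / flatten_and_concat branches used to perform by hand.
import torch

def merge_pixel_values(
        per_image: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
    # Each element has shape (num_patches_i, C, H, W); concatenate them into
    # one (sum_i num_patches_i, C, H, W) tensor plus a per-image patch count.
    num_image_patches = torch.tensor([t.shape[0] for t in per_image])
    pixel_values = torch.cat(per_image, dim=0)
    return pixel_values, num_image_patches

# Example: two images with 4 and 9 patches of 3x336x336 pixels each.
images = [torch.randn(4, 3, 336, 336), torch.randn(9, 3, 336, 336)]
pixel_values, num_image_patches = merge_pixel_values(images)
assert pixel_values.shape[0] == int(num_image_patches.sum())  # 13 patches total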