[Bugfix] Follow-up fix on MediaWithBytes (vllm-project#29951)

ywang96 · Patryk999 · commit a2d83d6980a8 · 2025-12-04T09:49:43.000Z
Signed-off-by: Roger Wang <hey@rogerw.io>
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
@@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]):
 
     The wrapper delegates attribute access to the underlying media object,
     making it behave transparently like the wrapped type (e.g., PIL.Image).
+
+    NOTE: Currently, this wrapper is used only for the image modality.
     """
 
     media: _T
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
@@ -32,6 +32,7 @@
     from PIL.Image import Image
     from transformers.feature_extraction_utils import BatchFeature
 
+    from .base import MediaWithBytes
     from .processing import MultiModalHashes
 
 else:
@@ -59,7 +60,7 @@
 item, which can be passed to a HuggingFace `AudioProcessor`.
 """
 
-ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
 """
 A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
@@ -484,7 +484,7 @@ def _parse_image_data(
             return ImageEmbeddingItems(data)
 
         if (
-            isinstance(data, PILImage.Image)
+            isinstance(data, (PILImage.Image, MediaWithBytes))
             or isinstance(data, (np.ndarray, torch.Tensor))
             and data.ndim == 3
         ):