[Bugfix] Add min image resolution requirement for vLLM Qwen-VL models (#737)

zch42 · Luodian · coderabbitai[bot] · web-flow · commit 68aa3c0207aa · 2025-07-07T21:55:03.000+08:00
* Add min image resolution requirement for vLLM Qwen-VL models

* more robust Qwen model detection

* Update lmms_eval/models/vllm.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

---------

Co-authored-by: Li Bo <drluodian@gmail.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
diff --git a/lmms_eval/models/vllm.py b/lmms_eval/models/vllm.py
@@ -40,6 +40,7 @@ def __init__(
         threads: int = 16,  # Threads to use for decoding visuals
         trust_remote_code: Optional[bool] = True,
         chat_template: Optional[str] = None,
+        min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
         **kwargs,
     ) -> None:
         super().__init__()
@@ -50,6 +51,9 @@ def __init__(
         self.max_frame_num = max_frame_num
         self.threads = threads
         self.chat_template = chat_template
+        self.min_image_pixels = min_image_pixels
+        # Qwen 2/2.5-VL models enforce minimum image dimensions
+        self._enforce_image_resize = self._is_qwen_vl_model(model_version)
 
         # Convert any string arguments that start with { and end with } to dictionaries
         for key, value in kwargs.items():
@@ -85,13 +89,32 @@ def __init__(
         self.device = self.accelerator.device
         self.batch_size_per_gpu = int(batch_size)
 
+    def _is_qwen_vl_model(self, model_version: str) -> bool:
+        name = model_version.lower()  # case-insensitive check for a Qwen 2 / 2.5-VL name fragment
+        return "qwen2-vl" in name or "qwen2.5-vl" in name
+
+    def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
+        """Return ``img`` upscaled (aspect ratio kept) so its shorter side is >= ``min_image_pixels``; raise ValueError on empty images."""
+        if self.min_image_pixels <= 0:  # a non-positive minimum disables resizing entirely
+            return img
+        shortest = min(img.size)
+        if shortest <= 0:
+            raise ValueError(f"Invalid image dimensions: {img.size}")
+        if not self._enforce_image_resize or shortest >= self.min_image_pixels:
+            return img  # resizing not enforced for this model, or image already large enough
+        scale = self.min_image_pixels / shortest
+        # int() truncation could leave a side one pixel under the minimum after float error; round and clamp instead
+        new_size = tuple(max(self.min_image_pixels, round(dim * scale)) for dim in img.size)
+        return img.resize(new_size, Image.BICUBIC)
+
     # Function to encode the image
     def encode_image(self, image: Union[Image.Image, str]):
         if isinstance(image, str):
             img = Image.open(image).convert("RGB")
         else:
             img = image.copy()
 
+        img = self._maybe_resize_image(img)
         output_buffer = BytesIO()
         img.save(output_buffer, format="PNG")
         byte_data = output_buffer.getvalue()
@@ -115,6 +138,7 @@ def encode_video(self, video_path):
         base64_frames = []
         for frame in frames:
             img = Image.fromarray(frame)
+            img = self._maybe_resize_image(img)
             output_buffer = BytesIO()
             img.save(output_buffer, format="PNG")
             byte_data = output_buffer.getvalue()