@@ -40,6 +40,7 @@ def __init__(
40
40
threads : int = 16 , # Threads to use for decoding visuals
41
41
trust_remote_code : Optional [bool ] = True ,
42
42
chat_template : Optional [str ] = None ,
43
+ min_image_pixels : int = 28 , # minimum image dimension, required for Qwen 2/2.5-VL models
43
44
** kwargs ,
44
45
) -> None :
45
46
super ().__init__ ()
@@ -50,6 +51,9 @@ def __init__(
50
51
self .max_frame_num = max_frame_num
51
52
self .threads = threads
52
53
self .chat_template = chat_template
54
+ self .min_image_pixels = min_image_pixels
55
+ # Qwen 2/2.5-VL models enforce minimum image dimensions
56
+ self ._enforce_image_resize = self ._is_qwen_vl_model (model_version )
53
57
54
58
# Convert any string arguments that start with { and end with } to dictionaries
55
59
for key , value in kwargs .items ():
@@ -85,13 +89,32 @@ def __init__(
85
89
self .device = self .accelerator .device
86
90
self .batch_size_per_gpu = int (batch_size )
87
91
92
def _is_qwen_vl_model(self, model_version: str) -> bool:
    """Report whether *model_version* names a Qwen2-VL or Qwen2.5-VL model.

    The check is a case-insensitive substring match, so any identifier
    that merely contains one of the markers (for example with an
    organisation prefix or a size suffix) is accepted.
    """
    lowered = model_version.lower()
    for marker in ("qwen2-vl", "qwen2.5-vl"):
        if marker in lowered:
            return True
    return False
95
+
96
def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
    """Upscale *img* so that its shorter side reaches ``self.min_image_pixels``.

    The image is returned unchanged when ``self.min_image_pixels`` is not
    positive, when ``self._enforce_image_resize`` is false, or when the
    shorter side already meets the minimum. Otherwise both sides are
    scaled by the same factor (preserving the aspect ratio up to integer
    rounding) and resampled with bicubic interpolation.

    Args:
        img: Image to check; only ``img.size`` and ``img.resize`` are used.

    Returns:
        The original image, or a resized copy whose sides are all at
        least ``self.min_image_pixels``.

    Raises:
        ValueError: If any side of *img* is zero or negative (checked
            whenever ``self.min_image_pixels`` is positive).
    """
    min_side = self.min_image_pixels
    # A non-positive minimum disables the whole check.
    if min_side <= 0:
        return img

    shortest = min(img.size)
    if shortest <= 0:
        raise ValueError(f"Invalid image dimensions: {img.size}")

    if not self._enforce_image_resize or shortest >= min_side:
        return img

    scale = min_side / shortest
    # Round instead of truncating, and clamp to the minimum:
    # shortest * (min_side / shortest) can come out a hair below min_side in
    # floating point (e.g. 49 * (64 / 49) == 63.99999999999999), which
    # int() would turn into a side that is still one pixel too small.
    new_size = tuple(max(min_side, round(dim * scale)) for dim in img.size)
    return img.resize(new_size, Image.BICUBIC)
109
+
88
110
# Function to encode the image
89
111
def encode_image (self , image : Union [Image .Image , str ]):
90
112
if isinstance (image , str ):
91
113
img = Image .open (image ).convert ("RGB" )
92
114
else :
93
115
img = image .copy ()
94
116
117
+ img = self ._maybe_resize_image (img )
95
118
output_buffer = BytesIO ()
96
119
img .save (output_buffer , format = "PNG" )
97
120
byte_data = output_buffer .getvalue ()
@@ -115,6 +138,7 @@ def encode_video(self, video_path):
115
138
base64_frames = []
116
139
for frame in frames :
117
140
img = Image .fromarray (frame )
141
+ img = self ._maybe_resize_image (img )
118
142
output_buffer = BytesIO ()
119
143
img .save (output_buffer , format = "PNG" )
120
144
byte_data = output_buffer .getvalue ()
0 commit comments