@@ -293,16 +293,29 @@ def _call_hf_processor(
293
293
294
294
pixel_values = processed_outputs .get ("pixel_values" )
295
295
if pixel_values is not None :
296
- images = mm_data ["images" ]
297
- assert isinstance (images , list )
298
-
299
- # Original output: (1, num_images, C, H, W)
300
- # New output: (num_images, C, H, W)
301
- assert (isinstance (pixel_values , list ) and len (pixel_values ) == 1 )
302
- assert (isinstance (pixel_values [0 ], list )
303
- and len (pixel_values [0 ]) == len (images ))
304
-
305
- processed_outputs ["pixel_values" ] = pixel_values [0 ]
296
+ # Before/after https://github.com/huggingface/transformers/pull/35122
297
+ if Version (TRANSFORMERS_VERSION ) <= Version ("4.48.2" ):
298
+ images = mm_data ["images" ]
299
+ assert isinstance (images , list )
300
+
301
+ # Original output: (1, num_images, C, H, W)
302
+ # New output: (num_images, C, H, W)
303
+ assert (isinstance (pixel_values , list )
304
+ and len (pixel_values ) == 1 )
305
+ assert (isinstance (pixel_values [0 ], list )
306
+ and len (pixel_values [0 ]) == len (images ))
307
+
308
+ processed_outputs ["pixel_values" ] = pixel_values [0 ]
309
+ else :
310
+ # Avoid padding since we need the output for each image to be
311
+ # independent of other images for the cache to work correctly
312
+ image_sizes = processed_outputs ["image_sizes" ]
313
+ assert len (pixel_values ) == len (image_sizes )
314
+
315
+ processed_outputs ["pixel_values" ] = [
316
+ p [:, :h , :w ]
317
+ for p , (h , w ) in zip (pixel_values , image_sizes )
318
+ ]
306
319
307
320
return processed_outputs
308
321
0 commit comments