@@ -793,6 +793,7 @@ def preprocess_inputs(
793793        image : Optional [Image ] =  None ,
794794        processor : Optional [AutoImageProcessor ] =  None ,
795795        tokenizer : Optional [PreTrainedTokenizer ] =  None ,
796+         config : Optional [PretrainedConfig ] =  None ,
796797    ):
797798        """ 
798799        Preprocess input instruction and an image. 
@@ -969,6 +970,7 @@ def preprocess_inputs(
969970        image : Optional [Image ] =  None ,
970971        processor : Optional [AutoImageProcessor ] =  None ,
971972        tokenizer : Optional [PreTrainedTokenizer ] =  None ,
973+         config : Optional [PretrainedConfig ] =  None ,
972974    ):
973975        if  processor  is  None :
974976            raise  ValueError ("Processor is required." )
@@ -1282,12 +1284,13 @@ def merge_vision_text_embeddings(
12821284        input_embeds  =  input_embeds .reshape (B , N , C )
12831285        return  input_embeds , attention_mask , position_ids 
12841286
1287+     @staticmethod  
12851288    def  preprocess_inputs (
1286-         self ,
12871289        text : str ,
12881290        image : Optional [Image ] =  None ,
12891291        processor : Optional [AutoImageProcessor ] =  None ,
12901292        tokenizer : Optional [PreTrainedTokenizer ] =  None ,
1293+         config : Optional [PretrainedConfig ] =  None ,
12911294    ):
12921295        if  tokenizer  is  None :
12931296            raise  ValueError ("Tokenizer is required." )
@@ -1379,13 +1382,15 @@ def load_image(image, input_size=448, max_num=12):
13791382            return  pixel_values 
13801383
13811384        if  image  is  not   None :
1385+             if  config  is  None :
1386+                 raise  ValueError ("Config is required." )
13821387            if  "<image>"  not  in   text :
13831388                text  =  "<image>\n "  +  text 
1384-             pixel_values  =  load_image (image , input_size = self . config .vision_config .image_size )
1389+             pixel_values  =  load_image (image , input_size = config .vision_config .image_size )
13851390            num_patches  =  pixel_values .shape [0 ]
13861391            num_image_token  =  int (
1387-                 (self . config .vision_config .image_size  //  self . config .vision_config .patch_size ) **  2 
1388-                 *  (self . config .downsample_ratio ** 2 )
1392+                 (config .vision_config .image_size  //  config .vision_config .patch_size ) **  2 
1393+                 *  (config .downsample_ratio ** 2 )
13891394            )
13901395            image_tokens  =  IMG_START_TOKEN  +  IMG_CONTEXT_TOKEN  *  num_image_token  *  num_patches  +  IMG_END_TOKEN 
13911396            text  =  text .replace ("<image>" , image_tokens , 1 )
@@ -1660,6 +1665,7 @@ def preprocess_inputs(
16601665        image : Optional [Image ] =  None ,
16611666        processor : Optional [AutoImageProcessor ] =  None ,
16621667        tokenizer : Optional [PreTrainedTokenizer ] =  None ,
1668+         config : Optional [PretrainedConfig ] =  None ,
16631669    ):
16641670        if  processor  is  None :
16651671            raise  ValueError ("Processor is required." )
@@ -1673,6 +1679,7 @@ def preprocess_inputs(
16731679                else  text 
16741680            )
16751681        inputs  =  processor ([prompt ], [image ], return_tensors = "pt" )
1682+         inputs .pop ("image_sizes" , None )
16761683        return  inputs 
16771684
16781685
@@ -1853,6 +1860,7 @@ def preprocess_inputs(
18531860        image : Optional [Image ] =  None ,
18541861        processor : Optional [AutoImageProcessor ] =  None ,
18551862        tokenizer : Optional [PreTrainedTokenizer ] =  None ,
1863+         config : Optional [PretrainedConfig ] =  None ,
18561864    ):
18571865        if  tokenizer  is  None :
18581866            raise  ValueError ("Tokenizer is required." )
@@ -2012,6 +2020,7 @@ def preprocess_inputs(
20122020        image : Optional [Image ] =  None ,
20132021        processor : Optional [AutoImageProcessor ] =  None ,
20142022        tokenizer : Optional [PreTrainedTokenizer ] =  None ,
2023+         config : Optional [PretrainedConfig ] =  None ,
20152024    ):
20162025        if  processor  is  None :
20172026            raise  ValueError ("Processor is required." )
0 commit comments