@@ -566,6 +566,25 @@ def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -
566566 """
567567 return [f'[{ self ._get_bbox_str (bbox )} ]' ]
568568
def _pre_tokenize_images(self, context_list: List[Context], loss_scale_list: List[float],
                         inputs: StdTemplateInputs) -> Tuple[List[Context], List[float]]:
    """Expand ``<image>`` placeholders into their replacement contexts.

    Every other context is passed through unchanged together with its loss scale.

    See https://github.com/modelscope/ms-swift/issues/3407
    Fix the bounding box position offset issue in the Qwen2.5-VL grounding task.

    Args:
        context_list: The contexts to scan for ``<image>`` placeholders.
        loss_scale_list: The loss scale paired with each entry of ``context_list``.
        inputs: The template inputs; ``inputs.image_idx`` is reset to 0 here and
            incremented once per replaced placeholder.

    Returns:
        The expanded context list and the matching loss-scale list. Every context
        produced by an image replacement gets a loss scale of 0.
    """
    expanded_contexts: List[Context] = []
    expanded_scales: List[float] = []
    inputs.image_idx = 0

    for item, scale in zip(context_list, loss_scale_list):
        # A placeholder is only expanded while there are still images left to consume.
        is_image_slot = (
            item == '<image>' and inputs.is_multimodal and inputs.image_idx < len(inputs.images))
        if not is_image_slot:
            expanded_contexts.append(item)
            expanded_scales.append(scale)
            continue

        replacement = self.replace_tag('image', inputs.image_idx, inputs)
        inputs.image_idx += 1
        expanded_contexts.extend(replacement)
        # One zero loss scale for each context the image tag expanded into.
        expanded_scales.extend([0.] * len(replacement))

    return expanded_contexts, expanded_scales
587+
569588 def _pre_tokenize (self , context_list : List [Context ], loss_scale_list : List [float ],
570589 inputs : StdTemplateInputs ) -> Tuple [List [Context ], List [float ]]:
571590 """This method happens before tokenization, replace standard tags to the contents or input_ids needed by
@@ -577,18 +596,19 @@ def _pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float
577596 Returns:
578597 The context_list and loss_scale_list after replacement.
579598 """
599+ context_list , loss_scale_list = self ._pre_tokenize_images (context_list , loss_scale_list , inputs )
580600 if inputs .images and inputs .objects :
581601 self .normalize_bbox (inputs )
582602 # replace tag/object/box
583603 res : List [Context ] = [] # result of context_list
584604 res_loss_scale : List [float ] = [] # result of loss_scale_list
585605
586606 # reset
587- for k in ['image' , ' video' , 'audio' , 'object' , 'box' ]:
607+ for k in ['video' , 'audio' , 'object' , 'box' ]:
588608 setattr (inputs , f'{ k } _idx' , 0 )
589609
590610 for context , loss_scale in zip (context_list , loss_scale_list ):
591- for k in ['image' , ' video' , 'audio' ]:
611+ for k in ['video' , 'audio' ]:
592612 if context == f'<{ k } >' and inputs .is_multimodal and getattr (inputs , f'{ k } _idx' ) < len (
593613 getattr (inputs , f'{ k } s' )):
594614 c_list = self .replace_tag (k , getattr (inputs , f'{ k } _idx' ), inputs )
0 commit comments