Skip to content

Commit 2c702ea

Browse files
authored
Fix Qwen2.5-VL grounding (#3491)
1 parent 3750eda commit 2c702ea

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

swift/llm/template/base.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,25 @@ def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -
566566
"""
567567
return [f'[{self._get_bbox_str(bbox)}]']
568568

569+
def _pre_tokenize_images(self, context_list: List[Context], loss_scale_list: List[float],
                         inputs: StdTemplateInputs) -> Tuple[List[Context], List[float]]:
    """Expand every usable ``'<image>'`` placeholder into its replacement contexts.

    Background: https://github.com/modelscope/ms-swift/issues/3407
    This pass exists to fix the bounding box position offset issue in the
    Qwen2.5-VL grounding task.

    Args:
        context_list: The contexts to scan for ``'<image>'`` placeholders.
        loss_scale_list: The loss scales, paired one-to-one with ``context_list``.
        inputs: The template inputs. ``inputs.image_idx`` is reset to 0 on entry and
            incremented once for each placeholder that gets replaced.

    Returns:
        The context_list and loss_scale_list after image replacement. Contexts produced
        from an image placeholder all carry a loss scale of 0.; every other context is
        passed through unchanged together with its original loss scale.
    """
    expanded: List[Context] = []
    expanded_scales: List[float] = []
    inputs.image_idx = 0

    for item, scale in zip(context_list, loss_scale_list):
        # A placeholder is only replaced while there are still unconsumed images.
        is_image_tag = item == '<image>' and inputs.is_multimodal and inputs.image_idx < len(inputs.images)
        if not is_image_tag:
            expanded.append(item)
            expanded_scales.append(scale)
            continue

        pieces = self.replace_tag('image', inputs.image_idx, inputs)
        inputs.image_idx += 1
        expanded.extend(pieces)
        # Image contexts never contribute to the loss.
        expanded_scales.extend([0.] * len(pieces))
    return expanded, expanded_scales
587+
569588
def _pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float],
570589
inputs: StdTemplateInputs) -> Tuple[List[Context], List[float]]:
571590
"""This method happens before tokenization, replace standard tags to the contents or input_ids needed by
@@ -577,18 +596,19 @@ def _pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float
577596
Returns:
578597
The context_list and loss_scale_list after replacement.
579598
"""
599+
context_list, loss_scale_list = self._pre_tokenize_images(context_list, loss_scale_list, inputs)
580600
if inputs.images and inputs.objects:
581601
self.normalize_bbox(inputs)
582602
# replace tag/object/box
583603
res: List[Context] = [] # result of context_list
584604
res_loss_scale: List[float] = [] # result of loss_scale_list
585605

586606
# reset
587-
for k in ['image', 'video', 'audio', 'object', 'box']:
607+
for k in ['video', 'audio', 'object', 'box']:
588608
setattr(inputs, f'{k}_idx', 0)
589609

590610
for context, loss_scale in zip(context_list, loss_scale_list):
591-
for k in ['image', 'video', 'audio']:
611+
for k in ['video', 'audio']:
592612
if context == f'<{k}>' and inputs.is_multimodal and getattr(inputs, f'{k}_idx') < len(
593613
getattr(inputs, f'{k}s')):
594614
c_list = self.replace_tag(k, getattr(inputs, f'{k}_idx'), inputs)

tests/train/test_grounding.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import os

from swift.llm import TrainArguments, sft_main

# 16 * 28 * 28 == 12544, exported as a string before training is launched.
# NOTE(review): presumably this caps the per-image pixel budget so the run stays
# small -- confirm against the code that reads MAX_PIXELS.
os.environ['MAX_PIXELS'] = str(16 * 28 * 28)


def _run_grounding_sft():
    """Launch SFT for Qwen2.5-VL-7B-Instruct on the ``AI-ModelScope/coco#2000`` dataset spec."""
    train_args = TrainArguments(model='Qwen/Qwen2.5-VL-7B-Instruct', dataset='AI-ModelScope/coco#2000')
    sft_main(train_args)


if __name__ == '__main__':
    _run_grounding_sft()

0 commit comments

Comments
 (0)