Skip to content

Commit e1218b7

Browse files
committed
[bugfix] fix vllm qwen2_5_vl (#5473)
1 parent ae87d1a commit e1218b7

File tree

5 files changed

+23
-16
lines changed

5 files changed

+23
-16
lines changed

swift/llm/infer/infer_engine/vllm_engine.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,9 @@ def _add_request(self,
280280
mm_data = {key.rstrip('s'): media_data[0]}
281281
if mm_data:
282282
llm_inputs['multi_modal_data'] = mm_data
283+
mm_processor_kwargs = inputs.get('mm_processor_kwargs')
284+
if mm_processor_kwargs:
285+
llm_inputs['mm_processor_kwargs'] = mm_processor_kwargs
283286
if self.task_type == 'embedding':
284287
from vllm.pooling_params import PoolingParams
285288
if 'task' in inspect.signature(PoolingParams).parameters:

swift/llm/template/base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1181,7 +1181,10 @@ def _encode_truncated(self, inputs: StdTemplateInputs):
11811181

11821182
if self.mode in {'vllm', 'lmdeploy', 'sglang'}:
11831183
encoded = Template._encode(self, inputs)
1184-
for key in ['images', 'audios', 'videos']:
1184+
keys = ['images', 'audios', 'videos']
1185+
if self.mode == 'vllm':
1186+
keys.append('mm_processor_kwargs')
1187+
for key in keys:
11851188
value = getattr(inputs, key)
11861189
if value:
11871190
encoded[key] = value

swift/llm/template/template/kwai.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
4444
video, video_kwargs = fetch_video({'video': video}, return_video_sample_fps=True)
4545
if isinstance(video, torch.Tensor):
4646
video = video.to(torch.uint8)
47-
inputs.videos[index] = (video, video_kwargs)
47+
inputs.videos[index] = video
48+
inputs.mm_processor_kwargs.setdefault('fps', []).append(video_kwargs)
4849
return ['<|vision_start|><|video_pad|><|vision_end|>']
4950

5051
def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
@@ -53,15 +54,12 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
5354
input_ids = encoded['input_ids']
5455
labels = encoded['labels']
5556
loss_scale = encoded.get('loss_scale', None)
56-
57-
images = inputs.images
58-
videos = [video[0] for video in inputs.videos]
59-
fps = [video[1] for video in inputs.videos]
6057
for media_type in ['images', 'videos']:
61-
if locals()[media_type]:
58+
mm_data = getattr(inputs, media_type)
59+
if mm_data:
6260
if media_type == 'images':
6361
media_token = self.image_token_id
64-
media_inputs = processor.image_processor(images=images, return_tensors='pt', do_resize=False)
62+
media_inputs = processor.image_processor(images=mm_data, return_tensors='pt', do_resize=False)
6563
media_grid_thw = media_inputs['image_grid_thw']
6664
else:
6765
kwargs = {}
@@ -70,9 +68,10 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
7068
else:
7169
processor_func = processor.image_processor
7270
kwargs['images'] = None
73-
media_inputs = processor_func(videos=videos, return_tensors='pt', do_resize=False, **kwargs)
71+
media_inputs = processor_func(videos=mm_data, return_tensors='pt', do_resize=False, **kwargs)
7472
media_grid_thw = media_inputs['video_grid_thw']
7573
media_token = self.video_token_id
74+
fps = inputs.mm_processor_kwargs['fps']
7675
media_inputs['second_per_grid_ts'] = [
7776
processor.image_processor.temporal_patch_size / tmp for tmp in fps
7877
]

swift/llm/template/template/qwen.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,9 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
244244
video, video_kwargs = fetch_video({'video': video}, return_video_sample_fps=True)
245245
if isinstance(video, torch.Tensor):
246246
video = video.to(torch.uint8)
247-
inputs.videos[index] = (video, video_kwargs)
247+
inputs.videos[index] = video
248+
if self.version == 'v2_5':
249+
inputs.mm_processor_kwargs.setdefault('fps', []).append(video_kwargs)
248250
return ['<|vision_start|><|video_pad|><|vision_end|>']
249251

250252
def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
@@ -259,14 +261,12 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
259261
input_ids = encoded['input_ids']
260262
labels = encoded['labels']
261263
loss_scale = encoded.get('loss_scale', None)
262-
images = inputs.images
263-
videos = [video[0] for video in inputs.videos]
264-
fps = [video[1] for video in inputs.videos]
265264
for media_type in ['images', 'videos']:
266-
if locals()[media_type]:
265+
mm_data = getattr(inputs, media_type)
266+
if mm_data:
267267
if media_type == 'images':
268268
media_token = self.image_token_id
269-
media_inputs = processor.image_processor(images=images, return_tensors='pt', do_resize=False)
269+
media_inputs = processor.image_processor(images=mm_data, return_tensors='pt', do_resize=False)
270270
media_grid_thw = media_inputs['image_grid_thw']
271271
else:
272272
kwargs = {}
@@ -275,10 +275,11 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
275275
else:
276276
processor_func = processor.image_processor
277277
kwargs['images'] = None
278-
media_inputs = processor_func(videos=videos, return_tensors='pt', do_resize=False, **kwargs)
278+
media_inputs = processor_func(videos=mm_data, return_tensors='pt', do_resize=False, **kwargs)
279279
media_grid_thw = media_inputs['video_grid_thw']
280280
media_token = self.video_token_id
281281
if self.version == 'v2_5':
282+
fps = inputs.mm_processor_kwargs['fps']
282283
media_inputs['second_per_grid_ts'] = [
283284
processor.image_processor.temporal_patch_size / tmp for tmp in fps
284285
]

swift/llm/template/template_inputs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ class StdTemplateInputs:
112112
rejected_images: List[Union[str, Image.Image]] = field(default_factory=list)
113113

114114
margin: Optional[float] = None # for reward modeling
115+
mm_processor_kwargs: Dict[str, Any] = field(default_factory=dict)
115116

116117
def __post_init__(self):
117118
self.image_idx = 0

0 commit comments

Comments (0)