Commit 9a35659

remove warning: unused or unrecognized kwargs images (#5357)
1 parent 75e6269 commit 9a35659
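
This commit targets the warning newer Hugging Face processors emit when they receive keyword arguments they do not recognize (here, passing images=None to a dedicated video processor, or videos=None to an image processor). Instead of always forwarding both kwargs, the call sites below build the kwargs dict conditionally. A minimal sketch of the pattern, with processor and videos named as in the diff (the standalone function is illustrative, not part of the codebase):

    def process_videos(processor, videos):
        # Only pass `images=None` when falling back to the image
        # processor, which still expects that argument; a dedicated
        # video processor would warn about the unrecognized `images`
        # kwarg.
        kwargs = {}
        if hasattr(processor, 'video_processor'):
            processor_func = processor.video_processor
        else:
            processor_func = processor.image_processor
            kwargs['images'] = None
        return processor_func(videos=videos, return_tensors='pt', do_resize=False, **kwargs)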

File tree

3 files changed: +14, −13 lines changed


swift/llm/template/template/kwai.py

Lines changed: 5 additions & 6 deletions
@@ -30,7 +30,6 @@ class KeyeVLTemplate(Template):
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
         from keye_vl_utils import fetch_image, fetch_video
-        # from qwen_vl_utils import fetch_image, fetch_video
         assert media_type in {'image', 'video'}
         if media_type == 'image':
             inputs.images[index] = fetch_image({'image': inputs.images[index]})
@@ -49,7 +48,6 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
         return ['<|vision_start|><|video_pad|><|vision_end|>']
 
     def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
-        from keye_vl_utils import vision_process
         encoded = super()._encode(inputs)
         processor = self.processor
         input_ids = encoded['input_ids']
@@ -63,15 +61,16 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             if locals()[media_type]:
                 if media_type == 'images':
                     media_token = self.image_token_id
-                    media_inputs = processor.image_processor(
-                        images=images, videos=None, return_tensors='pt', do_resize=False)
+                    media_inputs = processor.image_processor(images=images, return_tensors='pt', do_resize=False)
                     media_grid_thw = media_inputs['image_grid_thw']
                 else:
+                    kwargs = {}
                     if hasattr(processor, 'video_processor'):
                         processor_func = processor.video_processor
                     else:
                         processor_func = processor.image_processor
-                    media_inputs = processor_func(images=None, videos=videos, return_tensors='pt', do_resize=False)
+                        kwargs['images'] = None
+                    media_inputs = processor_func(videos=videos, return_tensors='pt', do_resize=False, **kwargs)
                     media_grid_thw = media_inputs['video_grid_thw']
                     media_token = self.video_token_id
                     media_inputs['second_per_grid_ts'] = [
@@ -118,7 +117,7 @@ def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
         if is_deepspeed_enabled():
             from PIL import Image
             images = [Image.new('RGB', (32, 32), (0, 0, 0))]
-            media_inputs = self.processor.image_processor(images=images, videos=None, return_tensors='pt')
+            media_inputs = self.processor.image_processor(images=images, return_tensors='pt')
             device = input_ids.device
             media_inputs = to_device(media_inputs, device)
             pixel_values = media_inputs['pixel_values'].type(dtype)
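
A side note on the _post_encode hunk above: when DeepSpeed is enabled, a 32x32 black placeholder image is encoded even for text-only batches. A plausible reading (an assumption; the surrounding code is not shown in this diff) is that every rank must run the vision tower so that collective operations stay in sync. This commit only drops the now-warning videos=None kwarg; the placeholder logic itself is unchanged:

    from PIL import Image

    # Dummy 32x32 black image, as in the hunk above.
    images = [Image.new('RGB', (32, 32), (0, 0, 0))]
    # media_inputs = processor.image_processor(images=images, return_tensors='pt')
    # (`processor` would be a Qwen2-VL-style processor; omitted here.)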

swift/llm/template/template/qwen.py

Lines changed: 6 additions & 6 deletions
@@ -2,7 +2,7 @@
 import os
 from dataclasses import dataclass, field
 from functools import partial
-from typing import Any, Dict, List, Literal, Optional, Tuple
+from typing import Any, Dict, List, Literal, Optional
 
 import torch
 import torch.nn.functional as F
@@ -266,19 +266,19 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             if locals()[media_type]:
                 if media_type == 'images':
                     media_token = self.image_token_id
-                    media_inputs = processor.image_processor(
-                        images=images, videos=None, return_tensors='pt', do_resize=False)
+                    media_inputs = processor.image_processor(images=images, return_tensors='pt', do_resize=False)
                     media_grid_thw = media_inputs['image_grid_thw']
                 else:
+                    kwargs = {}
                     if hasattr(processor, 'video_processor'):
                         processor_func = processor.video_processor
                     else:
                         processor_func = processor.image_processor
-                    media_inputs = processor_func(images=None, videos=videos, return_tensors='pt', do_resize=False)
+                        kwargs['images'] = None
+                    media_inputs = processor_func(videos=videos, return_tensors='pt', do_resize=False, **kwargs)
                     media_grid_thw = media_inputs['video_grid_thw']
                     media_token = self.video_token_id
                     if self.version == 'v2_5':
-                        from qwen_vl_utils import vision_process
                         media_inputs['second_per_grid_ts'] = [
                             processor.image_processor.temporal_patch_size / tmp for tmp in fps
                         ]
@@ -335,7 +335,7 @@ def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
         if is_deepspeed_enabled():
             from PIL import Image
             images = [Image.new('RGB', (32, 32), (0, 0, 0))]
-            media_inputs = self.processor.image_processor(images=images, videos=None, return_tensors='pt')
+            media_inputs = self.processor.image_processor(images=images, return_tensors='pt')
             device = input_ids.device
             media_inputs = to_device(media_inputs, device)
             pixel_values = media_inputs['pixel_values'].type(dtype)
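
The second_per_grid_ts list computed in the v2_5 branch above gives, for each video, how many seconds one temporal grid step covers. A worked example with assumed values (temporal_patch_size=2 is the Qwen2-VL default; the fps values are illustrative):

    temporal_patch_size = 2   # frames merged into one temporal grid step
    fps = [2.0, 1.0]          # sampled frames per second, one entry per video
    second_per_grid_ts = [temporal_patch_size / f for f in fps]
    print(second_per_grid_ts)  # [1.0, 2.0] -> seconds per grid step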

swift/llm/train/sft.py

Lines changed: 3 additions & 1 deletion
@@ -300,11 +300,13 @@ def _encode_dataset(self, train_dataset, val_dataset):
                 continue
             if not args.lazy_tokenize and not args.streaming:
                 preprocessor = EncodePreprocessor(template=template)
+                batch_size = 100 if args.model_meta.is_multimodal else 1000
                 dataset = preprocessor(
                     dataset,
                     num_proc=args.dataset_num_proc,
                     load_from_cache_file=args.load_from_cache_file,
-                    strict=args.strict)
+                    strict=args.strict,
+                    batch_size=batch_size)
                 datasets[i] = dataset
             template.model = origin_template_model
 
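The sft.py change lowers the map batch size for multimodal models, where each encoded row carries pixel tensors and is far heavier than a text-only row. Assuming EncodePreprocessor forwards batch_size to datasets.Dataset.map (an assumption based on its call signature here), the effect is equivalent to:

    from datasets import Dataset

    def encode_fn(batch):
        # Hypothetical stand-in for the template encoding step.
        return batch

    is_multimodal = True
    # Smaller write batches keep peak memory bounded when rows contain
    # large pixel_values tensors; 100/1000 are the values from the diff.
    batch_size = 100 if is_multimodal else 1000
    ds = Dataset.from_dict({'text': ['a', 'b']})
    ds = ds.map(encode_fn, batched=True, batch_size=batch_size)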
