Skip to content

Commit e5463b2

Browse files
committed
Merge branch 'main' into release/3.2
2 parents 976fd6c + b9bc5c1 commit e5463b2

File tree

3 files changed

+34
-4
lines changed

3 files changed

+34
-4
lines changed

swift/llm/template/template/qwen.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from ..template_inputs import StdTemplateInputs
1414
from ..template_meta import TemplateMeta
1515
from ..utils import Context, Word, findall
16-
from ..vision_utils import load_audio, load_batch
16+
from ..vision_utils import load_audio, load_batch, load_video_ovis2
1717
from .llama import Llama3TemplateMeta
1818
from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta
1919

@@ -410,10 +410,24 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
410410
template_cls=Ovis1_6Template,
411411
))
412412

413+
414+
class Ovis2Template(Ovis1_6Template):
    """Template for Ovis2 that adds video support on top of ``Ovis1_6Template``.

    Videos are decoded into still frames by ``load_video_ovis2`` and stored on
    ``inputs.images``; each frame is represented by one ``-200`` placeholder id.
    """
    # Default number of frames sampled per video. Used as the fallback value
    # passed to ``get_env_args('nframes', int, ...)`` in ``replace_tag``.
    nframes = 12

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        """Return the context that replaces a media tag in the prompt.

        Args:
            media_type: Kind of media tag being replaced.
            index: Position of this media item; for videos it indexes ``inputs.videos``.
            inputs: Template inputs. For videos, ``inputs.images`` is replaced by the
                frames sampled from ``inputs.videos[index]``.

        Returns:
            ``[[-200], '\\n']`` for an image, or one ``-200`` per loaded frame followed
            by ``'\\n'`` for a video. There is no branch for ``'audio'``, so that case
            falls through and returns ``None``.
        """
        if media_type == 'image':
            return [[-200], '\n']
        elif media_type == 'video':
            nframes = get_env_args('nframes', int, self.nframes)
            frames = load_video_ovis2(inputs.videos[index], nframes)
            # NOTE(review): this overwrites any images already present on `inputs`
            # (and the frames of a previous video) -- confirm that mixing images
            # with video, or passing several videos, is not expected to work here.
            inputs.images = frames
            # A clip shorter than `nframes` frames yields fewer frames than requested,
            # so emit one placeholder per frame actually loaded; presumably each -200
            # is later paired with one entry of `inputs.images` -- TODO confirm.
            return [[-200] * len(frames), '\n']
425+
426+
413427
register_template(
414428
QwenTemplateMeta(
415429
MLLMTemplateType.ovis2,
416-
template_cls=Ovis1_6Template,
430+
template_cls=Ovis2Template,
417431
placeholder_tokens=['<|image_pad|>', '<|video_pad|>'],
418432
))
419433

swift/llm/template/vision_utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,19 @@ def load_video_valley(video: Union[str, bytes]):
257257
video = video_reader.get_batch(np.linspace(0, len(video_reader) - 1, 8).astype(np.int_)).byte()
258258
images = [transforms.ToPILImage()(image.permute(2, 0, 1)).convert('RGB') for image in video]
259259
return images
260+
261+
262+
def _ovis2_frame_indices(total_frames, num_frames):
    """Pick the frame indices to sample: every frame, or the midpoint of each stride."""
    if num_frames <= 0:
        # Nothing requested; also avoids dividing by zero when computing the stride.
        return []
    if total_frames <= num_frames:
        return list(range(total_frames))
    stride = total_frames / num_frames
    # Midpoint of the i-th stride, clamped so rounding never runs past the last frame.
    return [min(total_frames - 1, int((stride * i + stride * (i + 1)) / 2)) for i in range(num_frames)]


def load_video_ovis2(video_path, num_frames: int):
    """Sample up to ``num_frames`` frames from a video as RGB PIL images.

    The frame count is estimated as ``int(clip.fps * clip.duration)``. If the clip has
    no more frames than requested, every frame is returned; otherwise the clip is split
    into ``num_frames`` equal strides and the middle frame of each stride is taken.

    Args:
        video_path: Video source handed directly to moviepy's ``VideoFileClip``.
        num_frames: Maximum number of frames to return. Values ``<= 0`` yield ``[]``.

    Returns:
        A list of ``PIL.Image`` objects; it may be shorter than ``num_frames``.
    """
    try:
        from moviepy.editor import VideoFileClip  # moviepy < 2.0
    except ImportError:
        # moviepy >= 2.0 removed the `editor` module; the class is exported at top level.
        from moviepy import VideoFileClip
    with VideoFileClip(video_path) as clip:
        total_frames = int(clip.fps * clip.duration)
        sampled_indices = _ovis2_frame_indices(total_frames, num_frames)
        # get_frame takes a timestamp in seconds, so convert frame index -> time.
        frames = [clip.get_frame(index / clip.fps) for index in sampled_indices]
        frames = [Image.fromarray(frame, mode='RGB') for frame in frames]
    return frames

tests/test_align/test_template/test_vision.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ def test_phi4_vision():
488488
# test_llava()
489489
# test_ovis1_6()
490490
# test_ovis1_6_llama3()
491-
# test_ovis2()
491+
test_ovis2()
492492
# test_yi_vl()
493493
# test_deepseek_vl()
494494
# test_deepseek_janus()
@@ -508,7 +508,7 @@ def test_phi4_vision():
508508
# test_florence()
509509
# test_glm_edge_v()
510510
# test_phi3_vision()
511-
test_phi4_vision()
511+
# test_phi4_vision()
512512
# test_internvl2_5()
513513
# test_internvl2_5_mpo()
514514
# test_mplug_owl3()

0 commit comments

Comments
 (0)