Merge branch 'main' into release/3.2

Jintao-Huang · Jintao-Huang · commit e5463b2b5660 · 2025-03-04T13:01:29.000+08:00
diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py
@@ -13,7 +13,7 @@
 from ..template_inputs import StdTemplateInputs
 from ..template_meta import TemplateMeta
 from ..utils import Context, Word, findall
-from ..vision_utils import load_audio, load_batch
+from ..vision_utils import load_audio, load_batch, load_video_ovis2
 from .llama import Llama3TemplateMeta
 from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta
 
@@ -410,10 +410,45 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         template_cls=Ovis1_6Template,
     ))
 
+
+class Ovis2Template(Ovis1_6Template):
+    """Template for Ovis2 that adds video input on top of `Ovis1_6Template`.
+
+    A video is sampled into still frames which are then stored as the sample's images.
+    """
+    # Default number of frames sampled per video; read through `get_env_args('nframes', ...)`.
+    nframes = 12
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        """Return the context that stands in for one media tag.
+
+        Args:
+            media_type: Kind of media the tag refers to.
+            index: Position of the media item within its list on `inputs`.
+            inputs: Sample being encoded; `inputs.images` is overwritten for videos.
+
+        Returns:
+            One `-200` id for an image, or one `-200` id per loaded video frame,
+            followed by a newline. Other media types fall through and return None.
+        """
+        if media_type == 'image':
+            return [[-200], '\n']
+        elif media_type == 'video':
+            nframes = get_env_args('nframes', int, self.nframes)
+            frames = load_video_ovis2(inputs.videos[index], nframes)
+            # NOTE(review): this replaces any images already on `inputs`, so mixing images
+            # with a video (or several videos) in one sample looks unsupported - confirm.
+            inputs.images = frames
+            # A clip shorter than `nframes` yields fewer frames; emit one placeholder per
+            # frame actually loaded so placeholders and images stay aligned.
+            return [[-200] * len(frames), '\n']
+
+
 register_template(
     QwenTemplateMeta(
         MLLMTemplateType.ovis2,
-        template_cls=Ovis1_6Template,
+        template_cls=Ovis2Template,
         placeholder_tokens=['<|image_pad|>', '<|video_pad|>'],
     ))
 
diff --git a/swift/llm/template/vision_utils.py b/swift/llm/template/vision_utils.py
@@ -257,3 +257,37 @@ def load_video_valley(video: Union[str, bytes]):
     video = video_reader.get_batch(np.linspace(0, len(video_reader) - 1, 8).astype(np.int_)).byte()
     images = [transforms.ToPILImage()(image.permute(2, 0, 1)).convert('RGB') for image in video]
     return images
+
+
+def load_video_ovis2(video_path, num_frames):
+    """Sample up to `num_frames` evenly spaced frames from a video file.
+
+    The clip is divided into `num_frames` equal segments and the middle frame
+    of each segment is taken; a clip with no more than `num_frames` frames yields every frame.
+
+    Args:
+        video_path: Location of the video, handed to moviepy's `VideoFileClip`.
+        num_frames: Maximum number of frames to return.
+
+    Returns:
+        A list of images built with `Image.fromarray(..., mode='RGB')`, at most `num_frames` long.
+    """
+    try:
+        from moviepy.editor import VideoFileClip
+    except ImportError:
+        # moviepy 2.x removed the `moviepy.editor` module; the class is exported by the package itself.
+        from moviepy import VideoFileClip
+    with VideoFileClip(video_path) as clip:
+        total_frames = int(clip.fps * clip.duration)
+        if total_frames <= num_frames:
+            sampled_indices = range(total_frames)
+        else:
+            stride = total_frames / num_frames
+            # Midpoint of the i-th segment, clamped to the last valid frame index.
+            sampled_indices = [
+                min(total_frames - 1, int((stride * i + stride * (i + 1)) / 2)) for i in range(num_frames)
+            ]
+        # `get_frame` is given a timestamp, so each frame index is divided by the frame rate.
+        frames = [clip.get_frame(index / clip.fps) for index in sampled_indices]
+        frames = [Image.fromarray(frame, mode='RGB') for frame in frames]
+    return frames
diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
@@ -488,7 +488,7 @@ def test_phi4_vision():
     # test_llava()
     # test_ovis1_6()
     # test_ovis1_6_llama3()
-    # test_ovis2()
+    test_ovis2()
     # test_yi_vl()
     # test_deepseek_vl()
     # test_deepseek_janus()
@@ -508,7 +508,7 @@ def test_phi4_vision():
     # test_florence()
     # test_glm_edge_v()
     # test_phi3_vision()
-    test_phi4_vision()
+    # test_phi4_vision()
     # test_internvl2_5()
     # test_internvl2_5_mpo()
     # test_mplug_owl3()