# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal

import torch

from ..base import Template
from ..constant import MLLMTemplateType
from ..register import register_template
from ..template_inputs import StdTemplateInputs
from ..utils import Context, Word, findall
from .utils import ChatmlTemplateMeta


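# ChatML-style template metadata for Keye-VL: no BOS token is auto-prepended
# and generation stops at <|endoftext|>.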
@dataclass
class KeyeTemplateMeta(ChatmlTemplateMeta):
    auto_add_bos: bool = False
    stop_words: List[Word] = field(default_factory=lambda: ['<|endoftext|>'])


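# Keye-VL reuses Qwen2-VL-style vision special tokens: ids 151655/151656 are
# the <|image_pad|>/<|video_pad|> placeholders that _encode expands per sample.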
class KeyeVLTemplate(Template):
    image_token_id = 151655
    video_token_id = 151656
    placeholder_tokens = ['<|image_pad|>', '<|video_pad|>']

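    # Replace a generic image/video tag with the vision placeholder span.
    # Media are loaded and preprocessed eagerly via keye_vl_utils, so the
    # processor can run with do_resize=False later in _encode.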
    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        # keye_vl_utils mirrors the qwen_vl_utils fetch_image/fetch_video interface.
        from keye_vl_utils import fetch_image, fetch_video
        assert media_type in {'image', 'video'}, f'unsupported media_type: {media_type}'
        if media_type == 'image':
            inputs.images[index] = fetch_image({'image': inputs.images[index]})
            if getattr(self, 'mode', None) == 'lmdeploy':
                return ['<|vision_start|>', [-100], '<|vision_end|>']
            else:
                return ['<|vision_start|><|image_pad|><|vision_end|>']
        else:
            video = inputs.videos[index]
            if os.path.isdir(video):
                # A directory of frames: sort so the frame order is deterministic.
                video = [os.path.join(video, fname) for fname in sorted(os.listdir(video))]
            video = fetch_video({'video': video})
            if isinstance(video, torch.Tensor):
                video = video.to(torch.uint8)
            inputs.videos[index] = video
            return ['<|vision_start|><|video_pad|><|vision_end|>']

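    # Tokenize the chat, then expand every single image/video pad token into the
    # per-sample count implied by the vision grid: t * h * w // merge_size**2.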
    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        from keye_vl_utils import vision_process
        encoded = super()._encode(inputs)
        processor = self.processor
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        images = inputs.images
        videos = inputs.videos
        for media_type, media_list in (('images', images), ('videos', videos)):
            if media_list:
                if media_type == 'images':
                    media_token = self.image_token_id
                    media_inputs = processor.image_processor(
                        images=images, videos=None, return_tensors='pt', do_resize=False)
                    media_grid_thw = media_inputs['image_grid_thw']
                else:
                    if hasattr(processor, 'video_processor'):
                        processor_func = processor.video_processor
                    else:
                        processor_func = processor.image_processor
                    media_inputs = processor_func(images=None, videos=videos, return_tensors='pt', do_resize=False)
                    media_grid_thw = media_inputs['video_grid_thw']
                    media_token = self.video_token_id
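                    # Seconds of video represented by one temporal grid step:
                    # temporal_patch_size frames at keye_vl_utils' sampling FPS.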
                    media_inputs['second_per_grid_ts'] = [
                        processor.image_processor.temporal_patch_size / vision_process.FPS
                    ] * len(media_grid_thw)
                idx_list = findall(input_ids, media_token)
                merge_length = processor.image_processor.merge_size**2

                def _get_new_tokens(i):
                    token_len = int(media_grid_thw[i].prod()) // merge_length
                    return [media_token] * token_len

                input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
                encoded.update(media_inputs)

        encoded['input_ids'] = input_ids
        encoded['labels'] = labels
        return encoded

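    # Collate multimodal fields across the batch: grid_thw tensors are
    # concatenated along dim 0, second_per_grid_ts is flattened into one list.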
    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        res = super()._data_collator_mm_data(batch)
        second_per_grid_ts = self.gather_list(batch, 'second_per_grid_ts')
        if second_per_grid_ts:
            res['second_per_grid_ts'] = second_per_grid_ts
        for media_type in ['image', 'video']:
            grid_thw = self.concat_tensor(batch, f'{media_type}_grid_thw', 0)
            if grid_thw is not None:
                res[f'{media_type}_grid_thw'] = grid_thw
        return res


# Register the Keye VL template
register_template(KeyeTemplateMeta(MLLMTemplateType.keye_vl, template_cls=KeyeVLTemplate))
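
# A minimal usage sketch (not part of the template itself). It assumes swift's
# public get_model_tokenizer/get_template helpers and an illustrative checkpoint
# id; both may need adjusting to the installed ms-swift version.
if __name__ == '__main__':
    from swift.llm import get_model_tokenizer, get_template

    # Model id is an assumption, for illustration only.
    model, processor = get_model_tokenizer('Kwai-Keye/Keye-VL-8B-Preview')
    template = get_template(MLLMTemplateType.keye_vl, processor)
    encoded = template.encode({
        'messages': [{'role': 'user', 'content': '<image>Describe this picture.'}],
        'images': ['example.jpg'],
    })
    # Expect input_ids/labels plus pixel values and image_grid_thw metadata.
    print(list(encoded.keys()))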