
Commit 912cc0c

Support Kwai-Keye/Keye-VL-8B-Preview (#4856)
1 parent 80aa7b2 commit 912cc0c

10 files changed (+178, -11 lines)

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 5 additions & 4 deletions
@@ -508,10 +508,6 @@
 |[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|✘|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)|
 |[LLM-Research/gemma-3-1b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-1b-pt)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/gemma-3-1b-pt](https://huggingface.co/google/gemma-3-1b-pt)|
 |[LLM-Research/gemma-3-1b-it](https://modelscope.cn/models/LLM-Research/gemma-3-1b-it)|gemma3_text|gemma3_text|transformers>=4.49|✘|-|[google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it)|
-|[google/gemma-3n-E2B](https://www.modelscope.cn/models/google/gemma-3n-E2B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B](https://huggingface.co/google/gemma-3n-E2B)|
-|[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)|
-|[google/gemma-3n-E4B](https://www.modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)|
-|[google/gemma-3n-E4B-it](https://www.modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|✘|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)|
 |[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|✘|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)|
 |[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|✘|-|-|
 |[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|skywork_o1|skywork_o1|transformers>=4.43|✔|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)|
@@ -799,6 +795,7 @@
 |[moonshotai/Kimi-VL-A3B-Instruct](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Instruct)|kimi_vl|kimi_vl|transformers<4.49|&#x2718;|-|[moonshotai/Kimi-VL-A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)|
 |[moonshotai/Kimi-VL-A3B-Thinking](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Thinking)|kimi_vl|kimi_vl|transformers<4.49|&#x2718;|-|[moonshotai/Kimi-VL-A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking)|
 |[moonshotai/Kimi-VL-A3B-Thinking-2506](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Thinking-2506)|kimi_vl|kimi_vl|transformers<4.49|&#x2718;|-|[moonshotai/Kimi-VL-A3B-Thinking-2506](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking-2506)|
+|[Kwai-Keye/Keye-VL-8B-Preview](https://modelscope.cn/models/Kwai-Keye/Keye-VL-8B-Preview)|keye_vl|keye_vl|-|&#x2718;|vision|[Kwai-Keye/Keye-VL-8B-Preview](https://huggingface.co/Kwai-Keye/Keye-VL-8B-Preview)|
 |[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct)|phi3_vision|phi3_vision|transformers>=4.36|&#x2718;|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
 |[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)|phi3_vision|phi3_vision|transformers>=4.36|&#x2718;|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
 |[LLM-Research/Phi-4-multimodal-instruct](https://modelscope.cn/models/LLM-Research/Phi-4-multimodal-instruct)|phi4_multimodal|phi4_multimodal|transformers>=4.36,<4.49, backoff, soundfile|&#x2718;|vision, audio|[microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)|
@@ -836,6 +833,10 @@
 |[LLM-Research/gemma-3-12b-it](https://modelscope.cn/models/LLM-Research/gemma-3-12b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|&#x2718;|-|[google/gemma-3-12b-it](https://huggingface.co/google/gemma-3-12b-it)|
 |[LLM-Research/gemma-3-27b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-27b-pt)|gemma3_vision|gemma3_vision|transformers>=4.49|&#x2718;|-|[google/gemma-3-27b-pt](https://huggingface.co/google/gemma-3-27b-pt)|
 |[LLM-Research/gemma-3-27b-it](https://modelscope.cn/models/LLM-Research/gemma-3-27b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|&#x2718;|-|[google/gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it)|
+|[google/gemma-3n-E2B](https://modelscope.cn/models/google/gemma-3n-E2B)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E2B](https://huggingface.co/google/gemma-3n-E2B)|
+|[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)|
+|[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)|
+|[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)|
 |[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral_2503|mistral_2503|transformers>=4.49|&#x2718;|-|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)|
 |[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral_2503|mistral_2503|transformers>=4.49|&#x2718;|-|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 5 additions & 4 deletions
@@ -508,10 +508,6 @@ The table below introduces the models integrated with ms-swift:
 |[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|&#x2718;|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)|
 |[LLM-Research/gemma-3-1b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-1b-pt)|gemma3_text|gemma3_text|transformers>=4.49|&#x2718;|-|[google/gemma-3-1b-pt](https://huggingface.co/google/gemma-3-1b-pt)|
 |[LLM-Research/gemma-3-1b-it](https://modelscope.cn/models/LLM-Research/gemma-3-1b-it)|gemma3_text|gemma3_text|transformers>=4.49|&#x2718;|-|[google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it)|
-|[google/gemma-3n-E2B](https://www.modelscope.cn/models/google/gemma-3n-E2B)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E2B](https://huggingface.co/google/gemma-3n-E2B)|
-|[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)|
-|[google/gemma-3n-E4B](https://www.modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)|
-|[google/gemma-3n-E4B-it](https://www.modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)|
 |[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|&#x2718;|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)|
 |[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|&#x2718;|-|-|
 |[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|skywork_o1|skywork_o1|transformers>=4.43|&#x2714;|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)|
@@ -799,6 +795,7 @@ The table below introduces the models integrated with ms-swift:
 |[moonshotai/Kimi-VL-A3B-Instruct](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Instruct)|kimi_vl|kimi_vl|transformers<4.49|&#x2718;|-|[moonshotai/Kimi-VL-A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)|
 |[moonshotai/Kimi-VL-A3B-Thinking](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Thinking)|kimi_vl|kimi_vl|transformers<4.49|&#x2718;|-|[moonshotai/Kimi-VL-A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking)|
 |[moonshotai/Kimi-VL-A3B-Thinking-2506](https://modelscope.cn/models/moonshotai/Kimi-VL-A3B-Thinking-2506)|kimi_vl|kimi_vl|transformers<4.49|&#x2718;|-|[moonshotai/Kimi-VL-A3B-Thinking-2506](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking-2506)|
+|[Kwai-Keye/Keye-VL-8B-Preview](https://modelscope.cn/models/Kwai-Keye/Keye-VL-8B-Preview)|keye_vl|keye_vl|-|&#x2718;|vision|[Kwai-Keye/Keye-VL-8B-Preview](https://huggingface.co/Kwai-Keye/Keye-VL-8B-Preview)|
 |[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct)|phi3_vision|phi3_vision|transformers>=4.36|&#x2718;|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
 |[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)|phi3_vision|phi3_vision|transformers>=4.36|&#x2718;|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
 |[LLM-Research/Phi-4-multimodal-instruct](https://modelscope.cn/models/LLM-Research/Phi-4-multimodal-instruct)|phi4_multimodal|phi4_multimodal|transformers>=4.36,<4.49, backoff, soundfile|&#x2718;|vision, audio|[microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)|
@@ -836,6 +833,10 @@ The table below introduces the models integrated with ms-swift:
 |[LLM-Research/gemma-3-12b-it](https://modelscope.cn/models/LLM-Research/gemma-3-12b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|&#x2718;|-|[google/gemma-3-12b-it](https://huggingface.co/google/gemma-3-12b-it)|
 |[LLM-Research/gemma-3-27b-pt](https://modelscope.cn/models/LLM-Research/gemma-3-27b-pt)|gemma3_vision|gemma3_vision|transformers>=4.49|&#x2718;|-|[google/gemma-3-27b-pt](https://huggingface.co/google/gemma-3-27b-pt)|
 |[LLM-Research/gemma-3-27b-it](https://modelscope.cn/models/LLM-Research/gemma-3-27b-it)|gemma3_vision|gemma3_vision|transformers>=4.49|&#x2718;|-|[google/gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it)|
+|[google/gemma-3n-E2B](https://modelscope.cn/models/google/gemma-3n-E2B)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E2B](https://huggingface.co/google/gemma-3n-E2B)|
+|[google/gemma-3n-E4B](https://modelscope.cn/models/google/gemma-3n-E4B)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)|
+|[google/gemma-3n-E2B-it](https://modelscope.cn/models/google/gemma-3n-E2B-it)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E2B-it](https://huggingface.co/google/gemma-3n-E2B-it)|
+|[google/gemma-3n-E4B-it](https://modelscope.cn/models/google/gemma-3n-E4B-it)|gemma3n|gemma3n|transformers>=4.53.1|&#x2718;|-|[google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it)|
 |[mistralai/Mistral-Small-3.1-24B-Base-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Base-2503)|mistral_2503|mistral_2503|transformers>=4.49|&#x2718;|-|[mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503)|
 |[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://modelscope.cn/models/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|mistral_2503|mistral_2503|transformers>=4.49|&#x2718;|-|[mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)|

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions
@@ -220,6 +220,7 @@ class MLLMModelType:
     got_ocr2_hf = 'got_ocr2_hf'
     step_audio = 'step_audio'
     kimi_vl = 'kimi_vl'
+    keye_vl = 'keye_vl'

     phi3_vision = 'phi3_vision'
     phi4_multimodal = 'phi4_multimodal'

swift/llm/model/model/mllm.py

Lines changed: 24 additions & 0 deletions
@@ -14,6 +14,7 @@
 from ..register import (Model, ModelGroup, ModelMeta, get_model_tokenizer_multimodal,
                         get_model_tokenizer_with_flash_attn, register_model)
 from ..utils import ModelInfo, use_submodel_func
+from .qwen import patch_qwen_vl_utils

 logger = get_logger()

@@ -178,3 +179,26 @@ def get_model_tokenizer_megrez_omni(model_dir, *args, **kwargs):
         model_arch=ModelArch.qwen2_vl,
         architectures=['Qwen2VLForConditionalGeneration'],
         tags=['vision']))
+
+
+def get_model_tokenizer_keye_vl(model_dir: str, *args, **kwargs):
+    model, processor = get_model_tokenizer_multimodal(model_dir, *args, **kwargs)
+    from keye_vl_utils import vision_process
+    patch_qwen_vl_utils(vision_process)
+    return model, processor
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.keye_vl,
+        [
+            ModelGroup([
+                Model('Kwai-Keye/Keye-VL-8B-Preview', 'Kwai-Keye/Keye-VL-8B-Preview'),
+            ]),
+        ],
+        TemplateType.keye_vl,
+        get_model_tokenizer_keye_vl,
+        model_arch=ModelArch.keye_vl,
+        architectures=['KeyeVLForConditionalGeneration'],
+        tags=['vision'],
+    ))

swift/llm/model/model_arch.py

Lines changed: 9 additions & 0 deletions
@@ -72,6 +72,7 @@ class MLLMModelArch:
     valley = 'valley'
     gemma3n = 'gemma3n'
     mistral_2503 = 'mistral_2503'
+    keye_vl = 'keye_vl'


 class ModelArch(LLMModelArch, MLLMModelArch):
@@ -603,6 +604,14 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non
         vision_tower=['model.vision_tower', 'model.audio_tower'],
     ))

+register_model_arch(
+    MultiModelKeys(
+        MLLMModelArch.keye_vl,
+        language_model='model',
+        aligner='mlp_AR',
+        vision_tower='visual',
+    ))
+

 def get_model_arch(arch_name: Optional[str]) -> Optional[MultiModelKeys]:
     return MODEL_ARCH_MAPPING.get(arch_name)
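
The keys registered here can be read back through get_model_arch, defined at the end of this file. A minimal sketch (not part of this commit); the import path below is an assumption based on this file's location:

    # Sketch only; not part of this commit.
    from swift.llm.model.model_arch import get_model_arch

    arch = get_model_arch('keye_vl')
    print(arch.language_model)  # 'model'
    print(arch.aligner)         # 'mlp_AR'
    print(arch.vision_tower)    # 'visual'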

swift/llm/template/constant.py

Lines changed: 1 addition & 0 deletions
@@ -177,6 +177,7 @@ class MLLMTemplateType:
     got_ocr2_hf = 'got_ocr2_hf'
     step_audio = 'step_audio'
     kimi_vl = 'kimi_vl'
+    keye_vl = 'keye_vl'

     idefics3 = 'idefics3'
     pixtral = 'pixtral'
swift/llm/template/template/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-from . import (baidu, bert, deepseek, emu3, gemma, glm, idefics3, internlm, internvl, llama, llava, llm, megrez,
+from . import (baidu, bert, deepseek, emu3, gemma, glm, idefics3, internlm, internvl, kwai, llama, llava, llm, megrez,
                microsoft, minicpm, minimax, mistral, molmo, moonshot, mplug, openbuddy, pixtral, qwen, stepfun, valley,
                yi)

swift/llm/template/template/kwai.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Literal
+
+import torch
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+from swift.llm import to_device
+from swift.utils import is_deepspeed_enabled
+from ..base import Template
+from ..constant import MLLMTemplateType
+from ..register import register_template
+from ..template_inputs import StdTemplateInputs
+from ..utils import Context, Word, findall
+from .qwen import Qwen2VLTemplate
+from .utils import ChatmlTemplateMeta
+
+
+@dataclass
+class KeyeTemplateMeta(ChatmlTemplateMeta):
+    auto_add_bos: bool = False
+    stop_words: List[Word] = field(default_factory=lambda: ['<|endoftext|>'])
+
+
+class KeyeVLTemplate(Template):
+    image_token_id = 151655
+    video_token_id = 151656
+    placeholder_tokens = ['<|image_pad|>', '<|video_pad|>']
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        from keye_vl_utils import fetch_image, fetch_video
+        # from qwen_vl_utils import fetch_image, fetch_video
+        assert media_type in {'image', 'video'}
+        if media_type == 'image':
+            inputs.images[index] = fetch_image({'image': inputs.images[index]})
+            if getattr(self, 'mode', None) == 'lmdeploy':
+                return ['<|vision_start|>', [-100], '<|vision_end|>']
+            else:
+                return ['<|vision_start|><|image_pad|><|vision_end|>']
+        else:
+            video = inputs.videos[index]
+            if os.path.isdir(video):
+                video = [os.path.join(video, fname) for fname in os.listdir(video)]
+            video = fetch_video({'video': video})
+            if isinstance(video, torch.Tensor):
+                video = video.to(torch.uint8)
+            inputs.videos[index] = video
+            return ['<|vision_start|><|video_pad|><|vision_end|>']
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        from keye_vl_utils import vision_process
+        encoded = super()._encode(inputs)
+        processor = self.processor
+        input_ids = encoded['input_ids']
+        labels = encoded['labels']
+        images = inputs.images
+        videos = inputs.videos
+        for media_type in ['images', 'videos']:
+            if locals()[media_type]:
+                if media_type == 'images':
+                    media_token = self.image_token_id
+                    media_inputs = processor.image_processor(
+                        images=images, videos=None, return_tensors='pt', do_resize=False)
+                    media_grid_thw = media_inputs['image_grid_thw']
+                else:
+                    if hasattr(processor, 'video_processor'):
+                        processor_func = processor.video_processor
+                    else:
+                        processor_func = processor.image_processor
+                    media_inputs = processor_func(images=None, videos=videos, return_tensors='pt', do_resize=False)
+                    media_grid_thw = media_inputs['video_grid_thw']
+                    media_token = self.video_token_id
+                    media_inputs['second_per_grid_ts'] = [
+                        processor.image_processor.temporal_patch_size / vision_process.FPS
+                    ] * len(media_grid_thw)
+                idx_list = findall(input_ids, media_token)
+                merge_length = processor.image_processor.merge_size**2
+
+                def _get_new_tokens(i):
+                    token_len = (media_grid_thw[i].prod() // merge_length)
+                    return [media_token] * token_len
+
+                input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
+                encoded.update(media_inputs)
+
+        encoded['input_ids'] = input_ids
+        encoded['labels'] = labels
+        return encoded
+
+    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
+        res = super()._data_collator_mm_data(batch)
+        second_per_grid_ts = self.gather_list(batch, 'second_per_grid_ts')
+        if second_per_grid_ts:
+            res['second_per_grid_ts'] = second_per_grid_ts
+        for media_type in ['image', 'video']:
+            grid_thw = self.concat_tensor(batch, f'{media_type}_grid_thw', 0)
+            if grid_thw is not None:
+                res[f'{media_type}_grid_thw'] = grid_thw
+        return res
+
+
+# Register the Keye VL template
+register_template(KeyeTemplateMeta(MLLMTemplateType.keye_vl, template_cls=KeyeVLTemplate))
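
A minimal image-inference sketch (not part of this commit) that would exercise the new keye_vl template via PtEngine, mirroring the video test added in the next file; InferRequest/RequestConfig usage follows the swift.llm API used elsewhere in the test suite, and 'cat.png' is a placeholder path:

    # Sketch only; not part of this commit. 'cat.png' is a placeholder image path.
    from swift.llm import InferRequest, PtEngine, RequestConfig

    engine = PtEngine('Kwai-Keye/Keye-VL-8B-Preview')
    req = InferRequest(
        messages=[{'role': 'user', 'content': '<image>Describe the image.'}],
        images=['cat.png'])
    resp = engine.infer([req], RequestConfig(max_tokens=128))[0]
    print(resp.choices[0].message.content)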

tests/test_align/test_template/test_video.py

Lines changed: 12 additions & 1 deletion
@@ -162,6 +162,16 @@ def test_glm4_1v():
     assert response == response2


+def test_keye_vl():
+    pt_engine = PtEngine('Kwai-Keye/Keye-VL-8B-Preview', attn_impl='flash_attention_2')
+    messages = [{'role': 'user', 'content': '<video>What happened in the video?'}]
+    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
+    response = _infer_model(pt_engine, messages=messages, videos=videos)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, messages=messages, videos=videos)
+    assert response == response2
+
+
 if __name__ == '__main__':
     from swift.llm import PtEngine, RequestConfig
     from swift.utils import get_logger, seed_everything
@@ -176,4 +186,5 @@ def test_glm4_1v():
     # test_valley()
     # test_qwen2_5_vl()
     # test_qwen2_5_omni()
-    test_glm4_1v()  # bug now, wait model fix
+    # test_glm4_1v()  # bug now, wait model fix
+    test_keye_vl()
