Commit 05e7137

[docs] update register mllm docs (#6282)

1 parent a354329

File tree

9 files changed: +1843 −2 lines


docs/source/BestPractices/注册多模态模型.md

Lines changed: 631 additions & 0 deletions
Large diffs are not rendered by default.

docs/source/Customization/自定义模型.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ For models built into ms-swift, you can load them directly by specifying model_id or model_path

 Each model_type has a unique model structure, template, and loading method. Of course, you can also override these by manually passing `--model_type` and `--template`. The model_type values and templates that ms-swift already supports are listed in [Supported Models and Datasets](../Instruction/支持的模型和数据集.md).

-The following introduces how to register a new model and its corresponding template.
+The following introduces how to register a new model and its corresponding template. For best practices, refer to [Best Practices for Registering Multimodal Models](../BestPractices/注册多模态模型.md).

 ## Model Registration

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ Swift DOCUMENTATION
    BestPractices/GRPO代码训练.md
    BestPractices/Qwen3最佳实践.md
    BestPractices/Qwen3-VL最佳实践.md
+   BestPractices/注册多模态模型.md
    BestPractices/Embedding训练.md
    BestPractices/Reranker训练.md
    BestPractices/快速训练VL模型.md

docs/source_en/BestPractices/MLLM-Registration.md

Lines changed: 639 additions & 0 deletions
Large diffs are not rendered by default.

docs/source_en/Customization/Custom-model.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ The models built into ms-swift can be used directly by specifying either `model_

 Each `model_type` has a unique model structure, template, and loading method. Of course, you can also manually override these by passing `--model_type` and `--template`. You can check the supported `model_type` and templates in the [Supported Models and Datasets](../Instruction/Supported-models-and-datasets.md).

-The following introduces how to register a new model and its corresponding template.
+The following introduces how to register a new model and its corresponding template. For best practices, refer to [Best Practices for Registering Multimodal Models](../BestPractices/MLLM-Registration.md).

 ## Model Registration
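
For orientation, the registration flow this page points to follows a fixed pattern: register a template, then register a model_type that binds a repo id to a loading function and that template. Below is a minimal sketch following the example already in Custom-model.md; the 'my_model'/'my_template' names and the repo id are placeholders, not part of this commit:

from swift.llm import (Model, ModelGroup, ModelMeta, TemplateMeta, get_model_tokenizer_with_flash_attn,
                       register_model, register_template)

# A template describes how a conversation is rendered into token text.
register_template(
    TemplateMeta(
        template_type='my_template',
        prefix=['<s>'],
        prompt=['User: {{QUERY}}\nAssistant: '],
        chat_sep=['\n']))

# A model_type binds a repo id to a loading function and a default template.
register_model(
    ModelMeta(
        model_type='my_model',
        model_groups=[ModelGroup([Model('my-org/my-model')])],  # placeholder repo id
        template='my_template',
        get_function=get_model_tokenizer_with_flash_attn))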

docs/source_en/index.rst

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@ Swift DOCUMENTATION
    BestPractices/GRPO-Code-Training.md
    BestPractices/Qwen3-Best-Practice.md
    BestPractices/Qwen3-VL-Best-Practice.md
+   BestPractices/MLLM-Registration.md
    BestPractices/Embedding.md
    BestPractices/Reranker.md
    BestPractices/Rapidly-Training-VL-model.md

examples/custom/my_qwen2_5_omni/my_register.py

Lines changed: 438 additions & 0 deletions
Large diffs are not rendered by default.
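
Since my_register.py is not rendered, here is a heavily hedged sketch of the model-registration half it must contain. The get_function signature follows the convention in ms-swift's custom-model docs; the template implementation (the part that expands the <image>/<video>/<audio> tags) is deliberately omitted rather than guessed at:

from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

from swift.llm import Model, ModelGroup, ModelMeta, register_model


def get_model_processor_omni(model_dir, model_info, model_kwargs, load_model=True, **kwargs):
    # Assumed signature, per the get_function convention in ms-swift's custom-model docs.
    processor = Qwen2_5OmniProcessor.from_pretrained(model_dir)
    model = None
    if load_model:
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            model_dir, torch_dtype=model_info.torch_dtype, **model_kwargs)
    return model, processor


register_model(
    ModelMeta(
        model_type='my_qwen2_5_omni',
        model_groups=[ModelGroup([Model('Qwen/Qwen2.5-Omni-7B')])],
        template='my_qwen2_5_omni',  # registered elsewhere in the real file
        get_function=get_model_processor_omni))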
examples/custom/my_qwen2_5_omni/…

Lines changed: 89 additions & 0 deletions

import os
import sys

import requests
from modelscope import snapshot_download
from qwen_omni_utils import process_mm_info
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

from swift.llm import InferRequest, PtEngine, RequestConfig

sys.path.append('examples/custom/my_qwen2_5_omni')


def infer_hf():
    model_dir = snapshot_download('Qwen/Qwen2.5-Omni-7B')
    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        model_dir, torch_dtype='auto', device_map='auto', attn_implementation='flash_attention_2')
    processor = Qwen2_5OmniProcessor.from_pretrained(model_dir)
    # Download the video and read it locally with decord (URL input is not yet supported).
    resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
    with open('_baby.mp4', 'wb') as f:
        f.write(resp.content)

    conversation = [
        {
            'role': 'user',
            'content': [
                {
                    'type': 'video',
                    'video': '_baby.mp4'
                },
                {
                    'type': 'image',
                    'image': 'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'
                },
                {
                    'type': 'text',
                    'text': 'Describe the video and image.'
                },
            ],
        },
    ]

    USE_AUDIO_IN_VIDEO = False
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors='pt',
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = inputs.to(model.device).to(model.dtype)
    text_ids = model.generate(
        **inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, thinker_do_sample=False, return_audio=False)
    text = processor.batch_decode(
        text_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return inputs['input_ids'][0].tolist(), text[0]


def test_my_qwen2_5_omni():
    engine = PtEngine('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni', attn_impl='flash_attention_2')
    infer_request = InferRequest(
        messages=[{
            'role': 'user',
            'content': '<video><image>Describe the video and image.',
        }],
        videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'],
        images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'],
    )
    request_config = RequestConfig(temperature=0, max_tokens=512)
    input_ids = engine.default_template.encode(infer_request)['input_ids']
    resp_list = engine.infer([infer_request], request_config)
    resp = resp_list[0].choices[0].message.content
    return input_ids, resp


if __name__ == '__main__':
    import my_register
    # Enable debug mode; `PtEngine.infer` will then print its input_ids and generate_ids.
    os.environ['SWIFT_DEBUG'] = '1'
    input_ids_hf, response_hf = infer_hf()
    input_ids_swift, response_swift = test_my_qwen2_5_omni()
    # Check that the token encoding and the generated response are aligned.
    assert input_ids_hf == input_ids_swift
    assert response_hf == response_swift
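
A note on the test design above: both paths decode greedily (thinker_do_sample=False on the HF side, temperature=0 on the swift side), so generation is deterministic and the asserts can demand exact equality. Comparing input_ids catches any divergence in the registered template's encoding, while comparing the responses confirms the loaded model behaves identically end to end.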
examples/custom/my_qwen2_5_omni/…

Lines changed: 42 additions & 0 deletions

import os
import sys

from swift.llm import TrainArguments, sft_main

sys.path.append('examples/custom/my_qwen2_5_omni')

if __name__ == '__main__':
    import my_register
    os.environ['MAX_PIXELS'] = '1003520'
    sft_main(
        TrainArguments(
            model='Qwen/Qwen2.5-Omni-7B',
            dataset='AI-ModelScope/LaTeX_OCR#5000',
            model_type='my_qwen2_5_omni',
            template='my_qwen2_5_omni',
            load_from_cache_file=True,
            split_dataset_ratio=0.01,
            train_type='lora',
            torch_dtype='bfloat16',
            attn_impl='flash_attn',
            padding_free=True,
            num_train_epochs=1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            learning_rate=1e-4,
            lora_rank=8,
            lora_alpha=32,
            target_modules='all-linear',
            freeze_vit=True,
            freeze_aligner=True,
            gradient_accumulation_steps=1,
            eval_steps=50,
            save_steps=50,
            save_total_limit=2,
            logging_steps=5,
            max_length=2048,
            output_dir='output',
            warmup_ratio=0.05,
            dataloader_num_workers=4,
            dataset_num_proc=1,
        ))
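
Note the training config's design choice: LoRA targets all linear layers of the language model, while freeze_vit and freeze_aligner keep the vision tower and the projector frozen, which is the cheap default for adapting an omni model to a task like LaTeX OCR.

After training, the LoRA checkpoint written under output/ can be exercised with the same registration in scope. A minimal sketch, assuming swift.llm's standard infer_main/InferArguments entry points; the checkpoint path is a placeholder:

import sys

from swift.llm import InferArguments, infer_main

sys.path.append('examples/custom/my_qwen2_5_omni')

if __name__ == '__main__':
    import my_register  # re-register my_qwen2_5_omni before the checkpoint is loaded
    infer_main(
        InferArguments(
            adapters=['output/vx-xxx/checkpoint-xxx'],  # placeholder: the saved LoRA checkpoint
            stream=True,
            max_new_tokens=512))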
