Commit 05e7137

[docs] update register mllm docs (#6282)

1 parent a354329

File tree

9 files changed: +1843 −2 lines


docs/source/BestPractices/注册多模态模型.md

Lines changed: 631 additions & 0 deletions
Large diffs are not rendered by default.

docs/source/Customization/自定义模型.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ For models built into ms-swift, you can load them directly by specifying model_id or model_path

 Each model_type has a unique model structure, template, and loading method. Of course, you can also override these by manually passing `--model_type` and `--template`. The model_type values and templates that ms-swift already supports are listed in [Supported Models and Datasets](../Instruction/支持的模型和数据集.md).

-The following introduces how to register a new model and its corresponding template.
+The following introduces how to register a new model and its corresponding template. For best practices, refer to [Best Practices for Registering Multimodal Models](../BestPractices/注册多模态模型.md).

 ## Model Registration

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ Swift DOCUMENTATION
    BestPractices/GRPO代码训练.md
    BestPractices/Qwen3最佳实践.md
    BestPractices/Qwen3-VL最佳实践.md
+   BestPractices/注册多模态模型.md
    BestPractices/Embedding训练.md
    BestPractices/Reranker训练.md
    BestPractices/快速训练VL模型.md

docs/source_en/BestPractices/MLLM-Registration.md

Lines changed: 639 additions & 0 deletions
Large diffs are not rendered by default.

docs/source_en/Customization/Custom-model.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ The models built into ms-swift can be used directly by specifying either `model_

 Each `model_type` has a unique model structure, template, and loading method. Of course, you can also manually override these by passing `--model_type` and `--template`. You can check the supported `model_type` and templates in the [Supported Models and Datasets](../Instruction/Supported-models-and-datasets.md).

-The following introduces how to register a new model and its corresponding template.
+The following introduces how to register a new model and its corresponding template. For best practices, refer to [Best Practices for Registering Multimodal Models](../BestPractices/MLLM-Registration.md).

 ## Model Registration
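
For orientation, the registration flow this page points to follows a fixed pattern: register a template, then register a model_type that binds a repo id to a loading function and that template. Below is a minimal sketch following the example already in Custom-model.md; the 'my_model'/'my_template' names and the repo id are placeholders, not part of this commit:

from swift.llm import (Model, ModelGroup, ModelMeta, TemplateMeta, get_model_tokenizer_with_flash_attn,
                       register_model, register_template)

# A template describes how a conversation is rendered into token text.
register_template(
    TemplateMeta(
        template_type='my_template',
        prefix=['<s>'],
        prompt=['User: {{QUERY}}\nAssistant: '],
        chat_sep=['\n']))

# A model_type binds a repo id to a loading function and a default template.
register_model(
    ModelMeta(
        model_type='my_model',
        model_groups=[ModelGroup([Model('my-org/my-model')])],  # placeholder repo id
        template='my_template',
        get_function=get_model_tokenizer_with_flash_attn))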

docs/source_en/index.rst

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@ Swift DOCUMENTATION
    BestPractices/GRPO-Code-Training.md
    BestPractices/Qwen3-Best-Practice.md
    BestPractices/Qwen3-VL-Best-Practice.md
+   BestPractices/MLLM-Registration.md
    BestPractices/Embedding.md
    BestPractices/Reranker.md
    BestPractices/Rapidly-Training-VL-model.md

examples/custom/my_qwen2_5_omni/my_register.py

Lines changed: 438 additions & 0 deletions
Large diffs are not rendered by default.
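
Since my_register.py is not rendered, here is a heavily hedged sketch of the model-registration half it must contain. The get_function signature follows the convention in ms-swift's custom-model docs; the template implementation (the part that expands the <image>/<video>/<audio> tags) is deliberately omitted rather than guessed at:

from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

from swift.llm import Model, ModelGroup, ModelMeta, register_model


def get_model_processor_omni(model_dir, model_info, model_kwargs, load_model=True, **kwargs):
    # Assumed signature, per the get_function convention in ms-swift's custom-model docs.
    processor = Qwen2_5OmniProcessor.from_pretrained(model_dir)
    model = None
    if load_model:
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            model_dir, torch_dtype=model_info.torch_dtype, **model_kwargs)
    return model, processor


register_model(
    ModelMeta(
        model_type='my_qwen2_5_omni',
        model_groups=[ModelGroup([Model('Qwen/Qwen2.5-Omni-7B')])],
        template='my_qwen2_5_omni',  # registered elsewhere in the real file
        get_function=get_model_processor_omni))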
examples/custom/my_qwen2_5_omni/…

Lines changed: 89 additions & 0 deletions

import os
import sys

import requests
from modelscope import snapshot_download
from qwen_omni_utils import process_mm_info
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

from swift.llm import InferRequest, PtEngine, RequestConfig

sys.path.append('examples/custom/my_qwen2_5_omni')


def infer_hf():
    model_dir = snapshot_download('Qwen/Qwen2.5-Omni-7B')
    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        model_dir, torch_dtype='auto', device_map='auto', attn_implementation='flash_attention_2')
    processor = Qwen2_5OmniProcessor.from_pretrained(model_dir)
    # Download the video and read it locally with decord (URL input is not yet supported).
    resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
    with open('_baby.mp4', 'wb') as f:
        f.write(resp.content)

    conversation = [
        {
            'role': 'user',
            'content': [
                {
                    'type': 'video',
                    'video': '_baby.mp4'
                },
                {
                    'type': 'image',
                    'image': 'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'
                },
                {
                    'type': 'text',
                    'text': 'Describe the video and image.'
                },
            ],
        },
    ]

    USE_AUDIO_IN_VIDEO = False
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors='pt',
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = inputs.to(model.device).to(model.dtype)
    text_ids = model.generate(
        **inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, thinker_do_sample=False, return_audio=False)
    text = processor.batch_decode(
        text_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return inputs['input_ids'][0].tolist(), text[0]


def test_my_qwen2_5_omni():
    engine = PtEngine('Qwen/Qwen2.5-Omni-7B', model_type='my_qwen2_5_omni', attn_impl='flash_attention_2')
    infer_request = InferRequest(
        messages=[{
            'role': 'user',
            'content': '<video><image>Describe the video and image.',
        }],
        videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'],
        images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'],
    )
    request_config = RequestConfig(temperature=0, max_tokens=512)
    input_ids = engine.default_template.encode(infer_request)['input_ids']
    resp_list = engine.infer([infer_request], request_config)
    resp = resp_list[0].choices[0].message.content
    return input_ids, resp


if __name__ == '__main__':
    import my_register
    # Enable debug mode; `PtEngine.infer` will then print its input_ids and generate_ids.
    os.environ['SWIFT_DEBUG'] = '1'
    input_ids_hf, response_hf = infer_hf()
    input_ids_swift, response_swift = test_my_qwen2_5_omni()
    # Check that the token encoding and the generated response are aligned.
    assert input_ids_hf == input_ids_swift
    assert response_hf == response_swift
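
A note on the test design above: both paths decode greedily (thinker_do_sample=False on the HF side, temperature=0 on the swift side), so generation is deterministic and the asserts can demand exact equality. Comparing input_ids catches any divergence in the registered template's encoding, while comparing the responses confirms the loaded model behaves identically end to end.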
examples/custom/my_qwen2_5_omni/…

Lines changed: 42 additions & 0 deletions

import os
import sys

from swift.llm import TrainArguments, sft_main

sys.path.append('examples/custom/my_qwen2_5_omni')

if __name__ == '__main__':
    import my_register
    os.environ['MAX_PIXELS'] = '1003520'
    sft_main(
        TrainArguments(
            model='Qwen/Qwen2.5-Omni-7B',
            dataset='AI-ModelScope/LaTeX_OCR#5000',
            model_type='my_qwen2_5_omni',
            template='my_qwen2_5_omni',
            load_from_cache_file=True,
            split_dataset_ratio=0.01,
            train_type='lora',
            torch_dtype='bfloat16',
            attn_impl='flash_attn',
            padding_free=True,
            num_train_epochs=1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            learning_rate=1e-4,
            lora_rank=8,
            lora_alpha=32,
            target_modules='all-linear',
            freeze_vit=True,
            freeze_aligner=True,
            gradient_accumulation_steps=1,
            eval_steps=50,
            save_steps=50,
            save_total_limit=2,
            logging_steps=5,
            max_length=2048,
            output_dir='output',
            warmup_ratio=0.05,
            dataloader_num_workers=4,
            dataset_num_proc=1,
        ))
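
Note the training config's design choice: LoRA targets all linear layers of the language model, while freeze_vit and freeze_aligner keep the vision tower and the projector frozen, which is the cheap default for adapting an omni model to a task like LaTeX OCR.

After training, the LoRA checkpoint written under output/ can be exercised with the same registration in scope. A minimal sketch, assuming swift.llm's standard infer_main/InferArguments entry points; the checkpoint path is a placeholder:

import sys

from swift.llm import InferArguments, infer_main

sys.path.append('examples/custom/my_qwen2_5_omni')

if __name__ == '__main__':
    import my_register  # re-register my_qwen2_5_omni before the checkpoint is loaded
    infer_main(
        InferArguments(
            adapters=['output/vx-xxx/checkpoint-xxx'],  # placeholder: the saved LoRA checkpoint
            stream=True,
            max_new_tokens=512))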
