diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 0cea1a34a5..7cd53b524b 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -213,7 +213,7 @@ - train_dataloader_shuffle: CPT/SFT训练的dataloader是否随机,默认为True。该参数对IterableDataset无效。IterableDataset采用顺序的方式读取。 - 🔥neftune_noise_alpha: neftune添加的噪声系数。默认为0,通常可以设置为5、10、15。 - 🔥use_liger_kernel: 是否启用[Liger](https://github.com/linkedin/Liger-Kernel)内核加速训练并减少显存消耗。默认为False。示例shell参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger)。 - - 注意:liger_kernel不支持device_map,请使用DDP/DeepSpeed进行多卡训练。 + - 注意:liger_kernel不支持device_map,请使用DDP/DeepSpeed进行多卡训练。liger_kernel目前只支持`task_type='causal_lm'`。 - average_tokens_across_devices: 是否在设备之间进行token数平均。如果设置为True,将使用all_reduce同步`num_tokens_in_batch`以进行精确的损失计算。默认为False。 - max_grad_norm: 梯度裁剪。默认为1.。 - 注意:日志中的grad_norm记录的是裁剪前的值。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 84a0800ced..81382e5df0 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -214,7 +214,7 @@ Other important parameters: - train_dataloader_shuffle: Whether to shuffle the dataloader in CPT/SFT training. Default is `True`. Not effective for `IterableDataset`, which uses sequential loading. - 🔥neftune_noise_alpha: Noise magnitude for NEFTune. Default is 0. Common values: 5, 10, 15. - 🔥use_liger_kernel: Whether to enable the [Liger](https://github.com/linkedin/Liger-Kernel) kernel to accelerate training and reduce GPU memory consumption. Defaults to False. Example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger). - - Note: Liger kernel does not support `device_map`. Use DDP or DeepSpeed for multi-GPU training. + - Note: Liger kernel does not support `device_map`. Use DDP or DeepSpeed for multi-GPU training. Currently, liger_kernel only supports `task_type='causal_lm'`. - average_tokens_across_devices: Whether to average token counts across devices. If `True`, `num_tokens_in_batch` is synchronized via `all_reduce` for accurate loss computation. Default is `False`. - max_grad_norm: Gradient clipping. Default is 1. - Note: The logged `grad_norm` reflects the value **before** clipping. 
diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py
index 90c4f55ec5..4db26b2be5 100644
--- a/swift/llm/model/constant.py
+++ b/swift/llm/model/constant.py
@@ -7,29 +7,18 @@ class LLMModelType:
     qwen = 'qwen'
     qwen2 = 'qwen2'
-    qwen2_5 = 'qwen2_5'
-    qwen2_5_math = 'qwen2_5_math'
     qwen2_moe = 'qwen2_moe'
-    qwq_preview = 'qwq_preview'
-    qwq = 'qwq'
     qwen3 = 'qwen3'
-    qwen3_thinking = 'qwen3_thinking'
-    qwen3_nothinking = 'qwen3_nothinking'
-    qwen3_coder = 'qwen3_coder'
     qwen3_moe = 'qwen3_moe'
-    qwen3_moe_thinking = 'qwen3_moe_thinking'
     qwen3_next = 'qwen3_next'
-    qwen3_next_thinking = 'qwen3_next_thinking'
     qwen3_emb = 'qwen3_emb'
     qwen3_reranker = 'qwen3_reranker'
-    qwen2_gte = 'qwen2_gte'
     bge_reranker = 'bge_reranker'
     codefuse_qwen = 'codefuse_qwen'
     modelscope_agent = 'modelscope_agent'
-    marco_o1 = 'marco_o1'
 
     llama = 'llama'
     llama3 = 'llama3'
@@ -168,7 +157,6 @@ class MLLMModelType:
     qwen2_audio = 'qwen2_audio'
     qwen3_vl = 'qwen3_vl'
     qwen3_moe_vl = 'qwen3_moe_vl'
-    qvq = 'qvq'
     qwen2_gme = 'qwen2_gme'
     ovis1_6 = 'ovis1_6'
     ovis1_6_llama3 = 'ovis1_6_llama3'
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
index 568fbb4e53..fdf843a1cb 100644
--- a/swift/llm/model/model/qwen.py
+++ b/swift/llm/model/model/qwen.py
@@ -349,60 +349,51 @@ def _get_cast_dtype(self) -> torch.dtype:
             ]),
             # other
             ModelGroup([Model('PowerInfer/SmallThinker-3B-Preview', 'PowerInfer/SmallThinker-3B-Preview')]),
-        ],
-        TemplateType.qwen,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37'],
-        model_arch=ModelArch.llama))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen2_5,
-        [  # qwen2.5
-            ModelGroup([
-                # instruct
-                Model('Qwen/Qwen2.5-0.5B-Instruct', 'Qwen/Qwen2.5-0.5B-Instruct'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct', 'Qwen/Qwen2.5-1.5B-Instruct'),
-                Model('Qwen/Qwen2.5-3B-Instruct', 'Qwen/Qwen2.5-3B-Instruct'),
-                Model('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-7B-Instruct'),
-                Model('Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-14B-Instruct'),
-                Model('Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen2.5-32B-Instruct'),
-                Model('Qwen/Qwen2.5-72B-Instruct', 'Qwen/Qwen2.5-72B-Instruct'),
-                # base
-                Model('Qwen/Qwen2.5-0.5B', 'Qwen/Qwen2.5-0.5B'),
-                Model('Qwen/Qwen2.5-1.5B', 'Qwen/Qwen2.5-1.5B'),
-                Model('Qwen/Qwen2.5-3B', 'Qwen/Qwen2.5-3B'),
-                Model('Qwen/Qwen2.5-7B', 'Qwen/Qwen2.5-7B'),
-                Model('Qwen/Qwen2.5-14B', 'Qwen/Qwen2.5-14B'),
-                Model('Qwen/Qwen2.5-32B', 'Qwen/Qwen2.5-32B'),
-                Model('Qwen/Qwen2.5-72B', 'Qwen/Qwen2.5-72B'),
-                # gptq-int4
-                Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4'),
-                # gptq-int8
-                Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8'),
-                # awq-int4
-                Model('Qwen/Qwen2.5-0.5B-Instruct-AWQ', 'Qwen/Qwen2.5-0.5B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct-AWQ', 'Qwen/Qwen2.5-1.5B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-3B-Instruct-AWQ', 'Qwen/Qwen2.5-3B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-7B-Instruct-AWQ', 'Qwen/Qwen2.5-7B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-14B-Instruct-AWQ', 'Qwen/Qwen2.5-14B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-32B-Instruct-AWQ', 'Qwen/Qwen2.5-32B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-72B-Instruct-AWQ', 'Qwen/Qwen2.5-72B-Instruct-AWQ'),
-            ]),
+            ModelGroup(
+                [
+                    # instruct
+                    Model('Qwen/Qwen2.5-0.5B-Instruct', 'Qwen/Qwen2.5-0.5B-Instruct'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct', 'Qwen/Qwen2.5-1.5B-Instruct'),
+                    Model('Qwen/Qwen2.5-3B-Instruct', 'Qwen/Qwen2.5-3B-Instruct'),
+                    Model('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-7B-Instruct'),
+                    Model('Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-14B-Instruct'),
+                    Model('Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen2.5-32B-Instruct'),
+                    Model('Qwen/Qwen2.5-72B-Instruct', 'Qwen/Qwen2.5-72B-Instruct'),
+                    # base
+                    Model('Qwen/Qwen2.5-0.5B', 'Qwen/Qwen2.5-0.5B'),
+                    Model('Qwen/Qwen2.5-1.5B', 'Qwen/Qwen2.5-1.5B'),
+                    Model('Qwen/Qwen2.5-3B', 'Qwen/Qwen2.5-3B'),
+                    Model('Qwen/Qwen2.5-7B', 'Qwen/Qwen2.5-7B'),
+                    Model('Qwen/Qwen2.5-14B', 'Qwen/Qwen2.5-14B'),
+                    Model('Qwen/Qwen2.5-32B', 'Qwen/Qwen2.5-32B'),
+                    Model('Qwen/Qwen2.5-72B', 'Qwen/Qwen2.5-72B'),
+                    # gptq-int4
+                    Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4'),
+                    # gptq-int8
+                    Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8'),
+                    # awq-int4
+                    Model('Qwen/Qwen2.5-0.5B-Instruct-AWQ', 'Qwen/Qwen2.5-0.5B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct-AWQ', 'Qwen/Qwen2.5-1.5B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-3B-Instruct-AWQ', 'Qwen/Qwen2.5-3B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-7B-Instruct-AWQ', 'Qwen/Qwen2.5-7B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-14B-Instruct-AWQ', 'Qwen/Qwen2.5-14B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-32B-Instruct-AWQ', 'Qwen/Qwen2.5-32B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-72B-Instruct-AWQ', 'Qwen/Qwen2.5-72B-Instruct-AWQ'),
+                ],
+                TemplateType.qwen2_5),
             # qwen2.5-coder
             ModelGroup(
                 [
@@ -441,21 +432,11 @@ def _get_cast_dtype(self) -> torch.dtype:
                    Model('Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4'),
                    Model('Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8'),
                ],
+                TemplateType.qwen2_5,
                tags=['coding']),
            ModelGroup([
                Model('moonshotai/Kimi-Dev-72B', 'moonshotai/Kimi-Dev-72B'),
-            ]),
-        ],
-        TemplateType.qwen2_5,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37'],
-        model_arch=ModelArch.llama))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen2_5_math,
-        [
+            ], TemplateType.qwen2_5),
            # qwen2.5-math
            ModelGroup(
                [
@@ -468,9 +449,16 @@ def _get_cast_dtype(self) -> torch.dtype:
                    Model('Qwen/Qwen2.5-Math-7B', 'Qwen/Qwen2.5-Math-7B'),
                    Model('Qwen/Qwen2.5-Math-72B', 'Qwen/Qwen2.5-Math-72B'),
                ],
+                TemplateType.qwen2_5_math,
                tags=['math']),
+            ModelGroup([Model('Qwen/QwQ-32B-Preview', 'Qwen/QwQ-32B-Preview')], TemplateType.qwq_preview),
+            ModelGroup([
+                Model('Qwen/QwQ-32B', 'Qwen/QwQ-32B'),
+                Model('Qwen/QwQ-32B-AWQ', 'Qwen/QwQ-32B-AWQ'),
+            ], TemplateType.qwq),
+            ModelGroup([Model('AIDC-AI/Marco-o1', 'AIDC-AI/Marco-o1')], TemplateType.marco_o1)
        ],
-        TemplateType.qwen2_5_math,
+        TemplateType.qwen,
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen2ForCausalLM'],
        requires=['transformers>=4.37'],
@@ -530,6 +518,14 @@ def _get_cast_dtype(self) -> torch.dtype:
                # swift
                Model('swift/Qwen3-32B-AWQ'),
            ]),
+            ModelGroup([
+                Model('Qwen/Qwen3-4B-Thinking-2507', 'Qwen/Qwen3-4B-Thinking-2507'),
+                Model('Qwen/Qwen3-4B-Thinking-2507-FP8', 'Qwen/Qwen3-4B-Thinking-2507-FP8'),
+            ], TemplateType.qwen3_thinking),
+            ModelGroup([
+                Model('Qwen/Qwen3-4B-Instruct-2507', 'Qwen/Qwen3-4B-Instruct-2507'),
+                Model('Qwen/Qwen3-4B-Instruct-2507-FP8', 'Qwen/Qwen3-4B-Instruct-2507-FP8'),
+            ], TemplateType.qwen3_nothinking)
        ],
        TemplateType.qwen3,
        get_model_tokenizer_with_flash_attn,
@@ -555,56 +551,17 @@ def _get_cast_dtype(self) -> torch.dtype:
            ]),
            ModelGroup([
                Model('iic/Tongyi-DeepResearch-30B-A3B', 'Alibaba-NLP/Tongyi-DeepResearch-30B-A3B'),
-            ])
-        ],
-        TemplateType.qwen3,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3MoeForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_thinking,
-        [
-            ModelGroup([
-                Model('Qwen/Qwen3-4B-Thinking-2507', 'Qwen/Qwen3-4B-Thinking-2507'),
-                Model('Qwen/Qwen3-4B-Thinking-2507-FP8', 'Qwen/Qwen3-4B-Thinking-2507-FP8'),
-            ]),
-        ],
-        TemplateType.qwen3_thinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3ForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_nothinking,
-        [
-            ModelGroup([
-                Model('Qwen/Qwen3-30B-A3B-Instruct-2507', 'Qwen/Qwen3-30B-A3B-Instruct-2507'),
-                Model('Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', 'Qwen/Qwen3-30B-A3B-Instruct-2507-FP8'),
-                Model('Qwen/Qwen3-235B-A22B-Instruct-2507', 'Qwen/Qwen3-235B-A22B-Instruct-2507'),
-                Model('Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', 'Qwen/Qwen3-235B-A22B-Instruct-2507-FP8'),
-                # awq
-                Model('swift/Qwen3-235B-A22B-Instruct-2507-AWQ'),
-            ]),
-            ModelGroup([
-                Model('Qwen/Qwen3-4B-Instruct-2507', 'Qwen/Qwen3-4B-Instruct-2507'),
-                Model('Qwen/Qwen3-4B-Instruct-2507-FP8', 'Qwen/Qwen3-4B-Instruct-2507-FP8'),
-            ])
-        ],
-        TemplateType.qwen3_nothinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3MoeForCausalLM', 'Qwen3ForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_coder,
-        [
+            ModelGroup(
+                [
+                    Model('Qwen/Qwen3-30B-A3B-Instruct-2507', 'Qwen/Qwen3-30B-A3B-Instruct-2507'),
+                    Model('Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', 'Qwen/Qwen3-30B-A3B-Instruct-2507-FP8'),
+                    Model('Qwen/Qwen3-235B-A22B-Instruct-2507', 'Qwen/Qwen3-235B-A22B-Instruct-2507'),
+                    Model('Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', 'Qwen/Qwen3-235B-A22B-Instruct-2507-FP8'),
+                    # awq
+                    Model('swift/Qwen3-235B-A22B-Instruct-2507-AWQ'),
+                ],
+                TemplateType.qwen3_nothinking),
            ModelGroup([
                Model('Qwen/Qwen3-Coder-30B-A3B-Instruct', 'Qwen/Qwen3-Coder-30B-A3B-Instruct'),
                Model('Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8', 'Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8'),
@@ -612,9 +569,20 @@ def _get_cast_dtype(self) -> torch.dtype:
                Model('Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8', 'Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8'),
                Model('swift/Qwen3-Coder-480B-A35B-Instruct-AWQ'),
            ],
+            TemplateType.qwen3_coder,
            tags=['coding']),
+            ModelGroup(
+                [
+                    Model('Qwen/Qwen3-30B-A3B-Thinking-2507', 'Qwen/Qwen3-30B-A3B-Thinking-2507'),
+                    Model('Qwen/Qwen3-30B-A3B-Thinking-2507-FP8', 'Qwen/Qwen3-30B-A3B-Thinking-2507-FP8'),
+                    Model('Qwen/Qwen3-235B-A22B-Thinking-2507', 'Qwen/Qwen3-235B-A22B-Thinking-2507'),
+                    Model('Qwen/Qwen3-235B-A22B-Thinking-2507-FP8', 'Qwen/Qwen3-235B-A22B-Thinking-2507-FP8'),
+                    # awq
+                    Model('swift/Qwen3-235B-A22B-Thinking-2507-AWQ'),
+                ],
+                TemplateType.qwen3_thinking),
        ],
-        TemplateType.qwen3_coder,
+        TemplateType.qwen3,
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen3MoeForCausalLM'],
        requires=['transformers>=4.51'],
@@ -622,47 +590,21 @@ def _get_cast_dtype(self) -> torch.dtype:
 register_model(
     ModelMeta(
-        LLMModelType.qwen3_moe_thinking,
+        LLMModelType.qwen3_next,
        [
            ModelGroup([
-                Model('Qwen/Qwen3-30B-A3B-Thinking-2507', 'Qwen/Qwen3-30B-A3B-Thinking-2507'),
-                Model('Qwen/Qwen3-30B-A3B-Thinking-2507-FP8', 'Qwen/Qwen3-30B-A3B-Thinking-2507-FP8'),
-                Model('Qwen/Qwen3-235B-A22B-Thinking-2507', 'Qwen/Qwen3-235B-A22B-Thinking-2507'),
-                Model('Qwen/Qwen3-235B-A22B-Thinking-2507-FP8', 'Qwen/Qwen3-235B-A22B-Thinking-2507-FP8'),
-                # awq
-                Model('swift/Qwen3-235B-A22B-Thinking-2507-AWQ'),
+                Model('Qwen/Qwen3-Next-80B-A3B-Instruct'),
+                Model('Qwen/Qwen3-Next-80B-A3B-Instruct-FP8'),
            ]),
+            ModelGroup([
+                Model('Qwen/Qwen3-Next-80B-A3B-Thinking'),
+                Model('Qwen/Qwen3-Next-80B-A3B-Thinking-FP8'),
+            ], TemplateType.qwen3_thinking)
        ],
-        TemplateType.qwen3_thinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3MoeForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_next,
-        [ModelGroup([
-            Model('Qwen/Qwen3-Next-80B-A3B-Instruct'),
-            Model('Qwen/Qwen3-Next-80B-A3B-Instruct-FP8'),
-        ])],
        TemplateType.qwen3_nothinking,
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen3NextForCausalLM'],
-        requires=['transformers>=4.57.0.dev'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_next_thinking,
-        [ModelGroup([
-            Model('Qwen/Qwen3-Next-80B-A3B-Thinking'),
-            Model('Qwen/Qwen3-Next-80B-A3B-Thinking-FP8'),
-        ])],
-        TemplateType.qwen3_thinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3NextForCausalLM'],
-        requires=['transformers>=4.57.0.dev'],
+        requires=['transformers>=4.57'],
     ))
@@ -773,22 +715,11 @@ def get_model_tokenizer_qwen2_vl(*args, **kwargs):
            ModelGroup([
                Model('allenai/olmOCR-7B-0225-preview', 'allenai/olmOCR-7B-0225-preview'),
            ]),
-        ],
-        TemplateType.qwen2_vl,
-        get_model_tokenizer_qwen2_vl,
-        model_arch=ModelArch.qwen2_vl,
-        architectures=['Qwen2VLForConditionalGeneration'],
-        requires=['transformers>=4.45', 'qwen_vl_utils>=0.0.6', 'decord'],
-        tags=['vision', 'video']))
-
-register_model(
-    ModelMeta(
-        MLLMModelType.qvq, [
            ModelGroup([
                Model('Qwen/QVQ-72B-Preview', 'Qwen/QVQ-72B-Preview'),
-            ]),
+            ], TemplateType.qvq),
        ],
-        TemplateType.qvq,
+        TemplateType.qwen2_vl,
        get_model_tokenizer_qwen2_vl,
        model_arch=ModelArch.qwen2_vl,
        architectures=['Qwen2VLForConditionalGeneration'],
@@ -855,7 +786,7 @@ def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
        get_model_tokenizer_qwen3_vl,
        model_arch=ModelArch.qwen3_vl,
        architectures=['Qwen3VLForConditionalGeneration'],
-        requires=['transformers>=4.57.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
+        requires=['transformers>=4.57', 'qwen_vl_utils>=0.0.14', 'decord'],
        tags=['vision', 'video']))
@@ -886,7 +817,7 @@ def get_model_tokenizer_qwen3_moe_vl(model_dir, *args, **kwargs):
        get_model_tokenizer_qwen3_moe_vl,
        model_arch=ModelArch.qwen3_vl,
        architectures=['Qwen3VLMoeForConditionalGeneration'],
-        requires=['transformers>=4.57.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
+        requires=['transformers>=4.57', 'qwen_vl_utils>=0.0.14', 'decord'],
        tags=['vision', 'video']))
 
 register_model(
@@ -1035,37 +966,6 @@ def get_model_tokenizer_qwen2_audio(*args, **kwargs):
        tags=['audio'],
    ))
 
-register_model(
-    ModelMeta(
-        LLMModelType.marco_o1, [ModelGroup([Model('AIDC-AI/Marco-o1', 'AIDC-AI/Marco-o1')])],
-        TemplateType.marco_o1,
-        get_model_tokenizer_with_flash_attn,
-        model_arch=ModelArch.llama,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37']))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwq_preview, [ModelGroup([Model('Qwen/QwQ-32B-Preview', 'Qwen/QwQ-32B-Preview')])],
-        TemplateType.qwq_preview,
-        get_model_tokenizer_with_flash_attn,
-        model_arch=ModelArch.llama,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37']))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwq,
-        [ModelGroup([
-            Model('Qwen/QwQ-32B', 'Qwen/QwQ-32B'),
-            Model('Qwen/QwQ-32B-AWQ', 'Qwen/QwQ-32B-AWQ'),
-        ])],
-        TemplateType.qwq,
-        get_model_tokenizer_with_flash_attn,
-        model_arch=ModelArch.llama,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37']))
-
 
 def get_model_tokenizer_ovis(*args, **kwargs):
     kwargs['attn_impl_keys'] = ['llm_attn_implementation']
diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py
index 90765e6652..af0b52a488 100644
--- a/swift/llm/model/register.py
+++ b/swift/llm/model/register.py
@@ -47,6 +47,7 @@ class ModelGroup:
     models: List[Model]
 
     # Higher priority. If set to None, the attributes of the ModelMeta will be used.
+    template: Optional[str] = None
     ignore_patterns: Optional[List[str]] = None
     requires: Optional[List[str]] = None
     tags: List[str] = field(default_factory=list)
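The `register.py` hunk is what makes the consolidation above possible: `ModelGroup` now carries an optional `template` that takes priority over the `ModelMeta`-level template, so variants such as QwQ, Marco-o1, and the 2507 releases can live inside a single `ModelMeta`. Below is a minimal sketch of a downstream registration using the new field; the `my_org/...` model IDs and the `my_qwen2` model_type are hypothetical, and the import line assumes these symbols are re-exported from `swift.llm` as defined in `swift/llm/model/register.py`:

```python
from swift.llm import (Model, ModelGroup, ModelMeta, TemplateType,
                       get_model_tokenizer_with_flash_attn, register_model)

register_model(
    ModelMeta(
        'my_qwen2',  # hypothetical model_type
        [
            # No per-group template: falls back to TemplateType.qwen below.
            ModelGroup([Model('my_org/my-qwen2-base')]),
            # Per-group override, passed positionally exactly like the QwQ
            # groups folded into the `qwen` ModelMeta in qwen.py above.
            ModelGroup([Model('my_org/my-qwq-finetune')], TemplateType.qwq),
        ],
        TemplateType.qwen,  # default template for groups that set none
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen2ForCausalLM'],
        requires=['transformers>=4.37']))
```

Since `template` is declared immediately after `models` in the dataclass, the positional form above and the keyword form `template=TemplateType.qwq` are interchangeable.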