diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 0cea1a34a5..7cd53b524b 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -213,7 +213,7 @@ - train_dataloader_shuffle: CPT/SFT训练的dataloader是否随机,默认为True。该参数对IterableDataset无效。IterableDataset采用顺序的方式读取。 - 🔥neftune_noise_alpha: neftune添加的噪声系数。默认为0,通常可以设置为5、10、15。 - 🔥use_liger_kernel: 是否启用[Liger](https://github.com/linkedin/Liger-Kernel)内核加速训练并减少显存消耗。默认为False。示例shell参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger)。 - - 注意:liger_kernel不支持device_map,请使用DDP/DeepSpeed进行多卡训练。 + - 注意:liger_kernel不支持device_map,请使用DDP/DeepSpeed进行多卡训练。liger_kernel目前只支持`task_type='causal_lm'`。 - average_tokens_across_devices: 是否在设备之间进行token数平均。如果设置为True,将使用all_reduce同步`num_tokens_in_batch`以进行精确的损失计算。默认为False。 - max_grad_norm: 梯度裁剪。默认为1.。 - 注意:日志中的grad_norm记录的是裁剪前的值。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 84a0800ced..81382e5df0 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -214,7 +214,7 @@ Other important parameters: - train_dataloader_shuffle: Whether to shuffle the dataloader in CPT/SFT training. Default is `True`. Not effective for `IterableDataset`, which uses sequential loading. - 🔥neftune_noise_alpha: Noise magnitude for NEFTune. Default is 0. Common values: 5, 10, 15. - 🔥use_liger_kernel: Whether to enable the [Liger](https://github.com/linkedin/Liger-Kernel) kernel to accelerate training and reduce GPU memory consumption. Defaults to False. Example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger). - - Note: Liger kernel does not support `device_map`. Use DDP or DeepSpeed for multi-GPU training. + - Note: Liger kernel does not support `device_map`. Use DDP or DeepSpeed for multi-GPU training. Currently, liger_kernel only supports `task_type='causal_lm'`. - average_tokens_across_devices: Whether to average token counts across devices. If `True`, `num_tokens_in_batch` is synchronized via `all_reduce` for accurate loss computation. Default is `False`. - max_grad_norm: Gradient clipping. Default is 1. - Note: The logged `grad_norm` reflects the value **before** clipping. 
diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py
index 90c4f55ec5..4db26b2be5 100644
--- a/swift/llm/model/constant.py
+++ b/swift/llm/model/constant.py
@@ -7,29 +7,18 @@ class LLMModelType:
     qwen = 'qwen'
     qwen2 = 'qwen2'
-    qwen2_5 = 'qwen2_5'
-    qwen2_5_math = 'qwen2_5_math'
     qwen2_moe = 'qwen2_moe'
-    qwq_preview = 'qwq_preview'
-    qwq = 'qwq'
     qwen3 = 'qwen3'
-    qwen3_thinking = 'qwen3_thinking'
-    qwen3_nothinking = 'qwen3_nothinking'
-    qwen3_coder = 'qwen3_coder'
     qwen3_moe = 'qwen3_moe'
-    qwen3_moe_thinking = 'qwen3_moe_thinking'
     qwen3_next = 'qwen3_next'
-    qwen3_next_thinking = 'qwen3_next_thinking'
     qwen3_emb = 'qwen3_emb'
     qwen3_reranker = 'qwen3_reranker'
-    qwen2_gte = 'qwen2_gte'
     bge_reranker = 'bge_reranker'
     codefuse_qwen = 'codefuse_qwen'
     modelscope_agent = 'modelscope_agent'
-    marco_o1 = 'marco_o1'
 
     llama = 'llama'
     llama3 = 'llama3'
@@ -168,7 +157,6 @@ class MLLMModelType:
     qwen2_audio = 'qwen2_audio'
     qwen3_vl = 'qwen3_vl'
     qwen3_moe_vl = 'qwen3_moe_vl'
-    qvq = 'qvq'
     qwen2_gme = 'qwen2_gme'
     ovis1_6 = 'ovis1_6'
     ovis1_6_llama3 = 'ovis1_6_llama3'
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
index 568fbb4e53..fdf843a1cb 100644
--- a/swift/llm/model/model/qwen.py
+++ b/swift/llm/model/model/qwen.py
@@ -349,60 +349,51 @@ def _get_cast_dtype(self) -> torch.dtype:
             ]),
             # other
             ModelGroup([Model('PowerInfer/SmallThinker-3B-Preview', 'PowerInfer/SmallThinker-3B-Preview')]),
-        ],
-        TemplateType.qwen,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37'],
-        model_arch=ModelArch.llama))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen2_5,
-        [  # qwen2.5
-            ModelGroup([
-                # instruct
-                Model('Qwen/Qwen2.5-0.5B-Instruct', 'Qwen/Qwen2.5-0.5B-Instruct'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct', 'Qwen/Qwen2.5-1.5B-Instruct'),
-                Model('Qwen/Qwen2.5-3B-Instruct', 'Qwen/Qwen2.5-3B-Instruct'),
-                Model('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-7B-Instruct'),
-                Model('Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-14B-Instruct'),
-                Model('Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen2.5-32B-Instruct'),
-                Model('Qwen/Qwen2.5-72B-Instruct', 'Qwen/Qwen2.5-72B-Instruct'),
-                # base
-                Model('Qwen/Qwen2.5-0.5B', 'Qwen/Qwen2.5-0.5B'),
-                Model('Qwen/Qwen2.5-1.5B', 'Qwen/Qwen2.5-1.5B'),
-                Model('Qwen/Qwen2.5-3B', 'Qwen/Qwen2.5-3B'),
-                Model('Qwen/Qwen2.5-7B', 'Qwen/Qwen2.5-7B'),
-                Model('Qwen/Qwen2.5-14B', 'Qwen/Qwen2.5-14B'),
-                Model('Qwen/Qwen2.5-32B', 'Qwen/Qwen2.5-32B'),
-                Model('Qwen/Qwen2.5-72B', 'Qwen/Qwen2.5-72B'),
-                # gptq-int4
-                Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4'),
-                Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4'),
-                # gptq-int8
-                Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8'),
-                Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8'),
-                # awq-int4
-                Model('Qwen/Qwen2.5-0.5B-Instruct-AWQ', 'Qwen/Qwen2.5-0.5B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-1.5B-Instruct-AWQ', 'Qwen/Qwen2.5-1.5B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-3B-Instruct-AWQ', 'Qwen/Qwen2.5-3B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-7B-Instruct-AWQ', 'Qwen/Qwen2.5-7B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-14B-Instruct-AWQ', 'Qwen/Qwen2.5-14B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-32B-Instruct-AWQ', 'Qwen/Qwen2.5-32B-Instruct-AWQ'),
-                Model('Qwen/Qwen2.5-72B-Instruct-AWQ', 'Qwen/Qwen2.5-72B-Instruct-AWQ'),
-            ]),
+            ModelGroup(
+                [
+                    # instruct
+                    Model('Qwen/Qwen2.5-0.5B-Instruct', 'Qwen/Qwen2.5-0.5B-Instruct'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct', 'Qwen/Qwen2.5-1.5B-Instruct'),
+                    Model('Qwen/Qwen2.5-3B-Instruct', 'Qwen/Qwen2.5-3B-Instruct'),
+                    Model('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-7B-Instruct'),
+                    Model('Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-14B-Instruct'),
+                    Model('Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen2.5-32B-Instruct'),
+                    Model('Qwen/Qwen2.5-72B-Instruct', 'Qwen/Qwen2.5-72B-Instruct'),
+                    # base
+                    Model('Qwen/Qwen2.5-0.5B', 'Qwen/Qwen2.5-0.5B'),
+                    Model('Qwen/Qwen2.5-1.5B', 'Qwen/Qwen2.5-1.5B'),
+                    Model('Qwen/Qwen2.5-3B', 'Qwen/Qwen2.5-3B'),
+                    Model('Qwen/Qwen2.5-7B', 'Qwen/Qwen2.5-7B'),
+                    Model('Qwen/Qwen2.5-14B', 'Qwen/Qwen2.5-14B'),
+                    Model('Qwen/Qwen2.5-32B', 'Qwen/Qwen2.5-32B'),
+                    Model('Qwen/Qwen2.5-72B', 'Qwen/Qwen2.5-72B'),
+                    # gptq-int4
+                    Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4'),
+                    Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4'),
+                    # gptq-int8
+                    Model('Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8'),
+                    Model('Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8'),
+                    # awq-int4
+                    Model('Qwen/Qwen2.5-0.5B-Instruct-AWQ', 'Qwen/Qwen2.5-0.5B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-1.5B-Instruct-AWQ', 'Qwen/Qwen2.5-1.5B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-3B-Instruct-AWQ', 'Qwen/Qwen2.5-3B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-7B-Instruct-AWQ', 'Qwen/Qwen2.5-7B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-14B-Instruct-AWQ', 'Qwen/Qwen2.5-14B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-32B-Instruct-AWQ', 'Qwen/Qwen2.5-32B-Instruct-AWQ'),
+                    Model('Qwen/Qwen2.5-72B-Instruct-AWQ', 'Qwen/Qwen2.5-72B-Instruct-AWQ'),
+                ],
+                TemplateType.qwen2_5),
             # qwen2.5-coder
             ModelGroup(
                 [
@@ -441,21 +432,11 @@ def _get_cast_dtype(self) -> torch.dtype:
                    Model('Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', 'Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4'),
                    Model('Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', 'Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8'),
                ],
+                TemplateType.qwen2_5,
                tags=['coding']),
            ModelGroup([
                Model('moonshotai/Kimi-Dev-72B', 'moonshotai/Kimi-Dev-72B'),
-            ]),
-        ],
-        TemplateType.qwen2_5,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37'],
-        model_arch=ModelArch.llama))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen2_5_math,
-        [
+            ], TemplateType.qwen2_5),
            # qwen2.5-math
            ModelGroup(
                [
@@ -468,9 +449,16 @@ def _get_cast_dtype(self) -> torch.dtype:
                    Model('Qwen/Qwen2.5-Math-7B', 'Qwen/Qwen2.5-Math-7B'),
                    Model('Qwen/Qwen2.5-Math-72B', 'Qwen/Qwen2.5-Math-72B'),
                ],
+                TemplateType.qwen2_5_math,
                tags=['math']),
+            ModelGroup([Model('Qwen/QwQ-32B-Preview', 'Qwen/QwQ-32B-Preview')], TemplateType.qwq_preview),
+            ModelGroup([
+                Model('Qwen/QwQ-32B', 'Qwen/QwQ-32B'),
+                Model('Qwen/QwQ-32B-AWQ', 'Qwen/QwQ-32B-AWQ'),
+            ], TemplateType.qwq),
+            ModelGroup([Model('AIDC-AI/Marco-o1', 'AIDC-AI/Marco-o1')], TemplateType.marco_o1)
        ],
-        TemplateType.qwen2_5_math,
+        TemplateType.qwen,
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen2ForCausalLM'],
        requires=['transformers>=4.37'],
@@ -530,6 +518,14 @@ def _get_cast_dtype(self) -> torch.dtype:
                # swift
                Model('swift/Qwen3-32B-AWQ'),
            ]),
+            ModelGroup([
+                Model('Qwen/Qwen3-4B-Thinking-2507', 'Qwen/Qwen3-4B-Thinking-2507'),
+                Model('Qwen/Qwen3-4B-Thinking-2507-FP8', 'Qwen/Qwen3-4B-Thinking-2507-FP8'),
+            ], TemplateType.qwen3_thinking),
+            ModelGroup([
+                Model('Qwen/Qwen3-4B-Instruct-2507', 'Qwen/Qwen3-4B-Instruct-2507'),
+                Model('Qwen/Qwen3-4B-Instruct-2507-FP8', 'Qwen/Qwen3-4B-Instruct-2507-FP8'),
+            ], TemplateType.qwen3_nothinking)
        ],
        TemplateType.qwen3,
        get_model_tokenizer_with_flash_attn,
@@ -555,56 +551,17 @@ def _get_cast_dtype(self) -> torch.dtype:
            ]),
            ModelGroup([
                Model('iic/Tongyi-DeepResearch-30B-A3B', 'Alibaba-NLP/Tongyi-DeepResearch-30B-A3B'),
-            ])
-        ],
-        TemplateType.qwen3,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3MoeForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_thinking,
-        [
-            ModelGroup([
-                Model('Qwen/Qwen3-4B-Thinking-2507', 'Qwen/Qwen3-4B-Thinking-2507'),
-                Model('Qwen/Qwen3-4B-Thinking-2507-FP8', 'Qwen/Qwen3-4B-Thinking-2507-FP8'),
-            ]),
-        ],
-        TemplateType.qwen3_thinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3ForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_nothinking,
-        [
-            ModelGroup([
-                Model('Qwen/Qwen3-30B-A3B-Instruct-2507', 'Qwen/Qwen3-30B-A3B-Instruct-2507'),
-                Model('Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', 'Qwen/Qwen3-30B-A3B-Instruct-2507-FP8'),
-                Model('Qwen/Qwen3-235B-A22B-Instruct-2507', 'Qwen/Qwen3-235B-A22B-Instruct-2507'),
-                Model('Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', 'Qwen/Qwen3-235B-A22B-Instruct-2507-FP8'),
-                # awq
-                Model('swift/Qwen3-235B-A22B-Instruct-2507-AWQ'),
-            ]),
-            ModelGroup([
-                Model('Qwen/Qwen3-4B-Instruct-2507', 'Qwen/Qwen3-4B-Instruct-2507'),
-                Model('Qwen/Qwen3-4B-Instruct-2507-FP8', 'Qwen/Qwen3-4B-Instruct-2507-FP8'),
-            ])
-        ],
-        TemplateType.qwen3_nothinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3MoeForCausalLM', 'Qwen3ForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_coder,
-        [
+            ModelGroup(
+                [
+                    Model('Qwen/Qwen3-30B-A3B-Instruct-2507', 'Qwen/Qwen3-30B-A3B-Instruct-2507'),
+                    Model('Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', 'Qwen/Qwen3-30B-A3B-Instruct-2507-FP8'),
+                    Model('Qwen/Qwen3-235B-A22B-Instruct-2507', 'Qwen/Qwen3-235B-A22B-Instruct-2507'),
+                    Model('Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', 'Qwen/Qwen3-235B-A22B-Instruct-2507-FP8'),
+                    # awq
+                    Model('swift/Qwen3-235B-A22B-Instruct-2507-AWQ'),
+                ],
+                TemplateType.qwen3_nothinking),
            ModelGroup([
                Model('Qwen/Qwen3-Coder-30B-A3B-Instruct', 'Qwen/Qwen3-Coder-30B-A3B-Instruct'),
                Model('Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8', 'Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8'),
@@ -612,9 +569,20 @@ def _get_cast_dtype(self) -> torch.dtype:
                Model('Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8', 'Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8'),
                Model('swift/Qwen3-Coder-480B-A35B-Instruct-AWQ'),
            ],
+            TemplateType.qwen3_coder,
            tags=['coding']),
+            ModelGroup(
+                [
+                    Model('Qwen/Qwen3-30B-A3B-Thinking-2507', 'Qwen/Qwen3-30B-A3B-Thinking-2507'),
+                    Model('Qwen/Qwen3-30B-A3B-Thinking-2507-FP8', 'Qwen/Qwen3-30B-A3B-Thinking-2507-FP8'),
+                    Model('Qwen/Qwen3-235B-A22B-Thinking-2507', 'Qwen/Qwen3-235B-A22B-Thinking-2507'),
+                    Model('Qwen/Qwen3-235B-A22B-Thinking-2507-FP8', 'Qwen/Qwen3-235B-A22B-Thinking-2507-FP8'),
+                    # awq
+                    Model('swift/Qwen3-235B-A22B-Thinking-2507-AWQ'),
+                ],
+                TemplateType.qwen3_thinking),
        ],
-        TemplateType.qwen3_coder,
+        TemplateType.qwen3,
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen3MoeForCausalLM'],
        requires=['transformers>=4.51'],
@@ -622,47 +590,21 @@ def _get_cast_dtype(self) -> torch.dtype:
 register_model(
     ModelMeta(
-        LLMModelType.qwen3_moe_thinking,
+        LLMModelType.qwen3_next,
        [
            ModelGroup([
-                Model('Qwen/Qwen3-30B-A3B-Thinking-2507', 'Qwen/Qwen3-30B-A3B-Thinking-2507'),
-                Model('Qwen/Qwen3-30B-A3B-Thinking-2507-FP8', 'Qwen/Qwen3-30B-A3B-Thinking-2507-FP8'),
-                Model('Qwen/Qwen3-235B-A22B-Thinking-2507', 'Qwen/Qwen3-235B-A22B-Thinking-2507'),
-                Model('Qwen/Qwen3-235B-A22B-Thinking-2507-FP8', 'Qwen/Qwen3-235B-A22B-Thinking-2507-FP8'),
-                # awq
-                Model('swift/Qwen3-235B-A22B-Thinking-2507-AWQ'),
+                Model('Qwen/Qwen3-Next-80B-A3B-Instruct'),
+                Model('Qwen/Qwen3-Next-80B-A3B-Instruct-FP8'),
            ]),
+            ModelGroup([
+                Model('Qwen/Qwen3-Next-80B-A3B-Thinking'),
+                Model('Qwen/Qwen3-Next-80B-A3B-Thinking-FP8'),
+            ], TemplateType.qwen3_thinking)
        ],
-        TemplateType.qwen3_thinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3MoeForCausalLM'],
-        requires=['transformers>=4.51'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_next,
-        [ModelGroup([
-            Model('Qwen/Qwen3-Next-80B-A3B-Instruct'),
-            Model('Qwen/Qwen3-Next-80B-A3B-Instruct-FP8'),
-        ])],
        TemplateType.qwen3_nothinking,
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen3NextForCausalLM'],
-        requires=['transformers>=4.57.0.dev'],
-    ))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwen3_next_thinking,
-        [ModelGroup([
-            Model('Qwen/Qwen3-Next-80B-A3B-Thinking'),
-            Model('Qwen/Qwen3-Next-80B-A3B-Thinking-FP8'),
-        ])],
-        TemplateType.qwen3_thinking,
-        get_model_tokenizer_with_flash_attn,
-        architectures=['Qwen3NextForCausalLM'],
-        requires=['transformers>=4.57.0.dev'],
+        requires=['transformers>=4.57'],
     ))
@@ -773,22 +715,11 @@ def get_model_tokenizer_qwen2_vl(*args, **kwargs):
            ModelGroup([
                Model('allenai/olmOCR-7B-0225-preview', 'allenai/olmOCR-7B-0225-preview'),
            ]),
-        ],
-        TemplateType.qwen2_vl,
-        get_model_tokenizer_qwen2_vl,
-        model_arch=ModelArch.qwen2_vl,
-        architectures=['Qwen2VLForConditionalGeneration'],
-        requires=['transformers>=4.45', 'qwen_vl_utils>=0.0.6', 'decord'],
-        tags=['vision', 'video']))
-
-register_model(
-    ModelMeta(
-        MLLMModelType.qvq, [
            ModelGroup([
                Model('Qwen/QVQ-72B-Preview', 'Qwen/QVQ-72B-Preview'),
-            ]),
+            ], TemplateType.qvq),
        ],
-        TemplateType.qvq,
+        TemplateType.qwen2_vl,
        get_model_tokenizer_qwen2_vl,
        model_arch=ModelArch.qwen2_vl,
        architectures=['Qwen2VLForConditionalGeneration'],
@@ -855,7 +786,7 @@ def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
        get_model_tokenizer_qwen3_vl,
        model_arch=ModelArch.qwen3_vl,
        architectures=['Qwen3VLForConditionalGeneration'],
-        requires=['transformers>=4.57.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
+        requires=['transformers>=4.57', 'qwen_vl_utils>=0.0.14', 'decord'],
        tags=['vision', 'video']))
@@ -886,7 +817,7 @@ def get_model_tokenizer_qwen3_moe_vl(model_dir, *args, **kwargs):
        get_model_tokenizer_qwen3_moe_vl,
        model_arch=ModelArch.qwen3_vl,
        architectures=['Qwen3VLMoeForConditionalGeneration'],
-        requires=['transformers>=4.57.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
+        requires=['transformers>=4.57', 'qwen_vl_utils>=0.0.14', 'decord'],
        tags=['vision', 'video']))
 
 register_model(
@@ -1035,37 +966,6 @@ def get_model_tokenizer_qwen2_audio(*args, **kwargs):
        tags=['audio'],
    ))
 
-register_model(
-    ModelMeta(
-        LLMModelType.marco_o1, [ModelGroup([Model('AIDC-AI/Marco-o1', 'AIDC-AI/Marco-o1')])],
-        TemplateType.marco_o1,
-        get_model_tokenizer_with_flash_attn,
-        model_arch=ModelArch.llama,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37']))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwq_preview, [ModelGroup([Model('Qwen/QwQ-32B-Preview', 'Qwen/QwQ-32B-Preview')])],
-        TemplateType.qwq_preview,
-        get_model_tokenizer_with_flash_attn,
-        model_arch=ModelArch.llama,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37']))
-
-register_model(
-    ModelMeta(
-        LLMModelType.qwq,
-        [ModelGroup([
-            Model('Qwen/QwQ-32B', 'Qwen/QwQ-32B'),
-            Model('Qwen/QwQ-32B-AWQ', 'Qwen/QwQ-32B-AWQ'),
-        ])],
-        TemplateType.qwq,
-        get_model_tokenizer_with_flash_attn,
-        model_arch=ModelArch.llama,
-        architectures=['Qwen2ForCausalLM'],
-        requires=['transformers>=4.37']))
-
 
 def get_model_tokenizer_ovis(*args, **kwargs):
     kwargs['attn_impl_keys'] = ['llm_attn_implementation']
diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py
index 90765e6652..af0b52a488 100644
--- a/swift/llm/model/register.py
+++ b/swift/llm/model/register.py
@@ -47,6 +47,7 @@ class ModelGroup:
     models: List[Model]
 
     # Higher priority. If set to None, the attributes of the ModelMeta will be used.
+    template: Optional[str] = None
     ignore_patterns: Optional[List[str]] = None
     requires: Optional[List[str]] = None
     tags: List[str] = field(default_factory=list)
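The `register.py` hunk is what makes the consolidation above possible: `ModelGroup` now carries an optional `template` that takes priority over the `ModelMeta`-level template, so variants such as QwQ, Marco-o1, and the 2507 releases can live inside a single `ModelMeta`. Below is a minimal sketch of a downstream registration using the new field; the `my_org/...` model IDs and the `my_qwen2` model_type are hypothetical, and the import line assumes these symbols are re-exported from `swift.llm` as defined in `swift/llm/model/register.py`:

```python
from swift.llm import (Model, ModelGroup, ModelMeta, TemplateType,
                       get_model_tokenizer_with_flash_attn, register_model)

register_model(
    ModelMeta(
        'my_qwen2',  # hypothetical model_type
        [
            # No per-group template: falls back to TemplateType.qwen below.
            ModelGroup([Model('my_org/my-qwen2-base')]),
            # Per-group override, passed positionally exactly like the QwQ
            # groups folded into the `qwen` ModelMeta in qwen.py above.
            ModelGroup([Model('my_org/my-qwq-finetune')], TemplateType.qwq),
        ],
        TemplateType.qwen,  # default template for groups that set none
        get_model_tokenizer_with_flash_attn,
        architectures=['Qwen2ForCausalLM'],
        requires=['transformers>=4.37']))
```

Since `template` is declared immediately after `models` in the dataclass, the positional form above and the keyword form `template=TemplateType.qwq` are interchangeable.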