
Commit f57790c

Merge branch 'main' into release/3.3

2 parents ea81928 + d36edc9

24 files changed: +72 additions, -19 deletions


README.md

Lines changed: 1 addition & 1 deletion

@@ -125,7 +125,7 @@ Running Environment:
| peft | >=0.11,<0.16 | ||
| trl | >=0.13,<0.17 | 0.16 |RLHF|
| deepspeed | >=0.14 | 0.14.5 | Training |
-| vllm | >=0.5.1 | 0.8.3 | Inference/Deployment/Evaluation |
+| vllm | >=0.5.1 | 0.7.3/0.8.3 | Inference/Deployment/Evaluation |
| lmdeploy | >=0.5 | 0.7.2.post1 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |

README_CN.md

Lines changed: 1 addition & 1 deletion

@@ -120,7 +120,7 @@ pip install -e .
| peft | >=0.11,<0.16 | ||
| trl | >=0.13,<0.17 | 0.16 |RLHF|
| deepspeed | >=0.14 | 0.14.5 |Training|
-| vllm | >=0.5.1 | 0.8.3 |Inference/Deployment/Evaluation|
+| vllm | >=0.5.1 | 0.7.3/0.8.3 |Inference/Deployment/Evaluation|
| lmdeploy | >=0.5 | 0.7.2.post1 |Inference/Deployment/Evaluation|
| evalscope | >=0.11 | |Evaluation|

docs/source/GetStarted/SWIFT安装.md

Lines changed: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
| peft | >=0.11,<0.16 | ||
| trl | >=0.13,<0.17 | 0.16 |RLHF|
| deepspeed | >=0.14 | 0.14.5 |Training|
-| vllm | >=0.5.1 | 0.8.3 |Inference/Deployment/Evaluation|
+| vllm | >=0.5.1 | 0.7.3/0.8.3 |Inference/Deployment/Evaluation|
| lmdeploy | >=0.5 | 0.7.2.post1 |Inference/Deployment/Evaluation|
| evalscope | >=0.11 | |Evaluation|

docs/source/Instruction/GRPO.md

Lines changed: 1 addition & 0 deletions

@@ -133,6 +133,7 @@ A conversation between User and Assistant. The user asks a question, and the Ass
- move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, how many batches to split the layers into. Default is None, meaning the whole model is not split; otherwise it is split into move_model_batches + 1 (non-layer parameters) + 1 (multimodal component parameters) batches.
- offload_optimizer: Whether to offload optimizer parameters during vLLM/LMDeploy inference. Default is False.
- offload_model: Whether to offload the model itself during vLLM/LMDeploy inference. Default is False.
+- Note: If this parameter is set to True and grad_norm stays at 0 during training, please install `vllm==0.7.3` (see the sketch after this diff).
- gc_collect_after_offload: Whether to run garbage collection (both Python GC and GPU GC) after offloading finishes. Default is False.
- multi_turn_func: Multi-turn GRPO parameter. Pass the corresponding plugin name and add the matching implementation in plugin/multi_turn.py.
- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. For the split to be valid, per_device_batch must be divisible by mini_batch_size.
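
A minimal sketch of the workaround named in the note above, assuming a standard pip environment:

    # Pin vLLM to 0.7.3 if grad_norm stays at 0 with offload_model=True
    pip install "vllm==0.7.3"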

docs/source/Instruction/命令行参数.md

Lines changed: 2 additions & 0 deletions

@@ -413,6 +413,7 @@ The reward model parameters are used in PPO and GRPO.
- move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, how many batches to split the layers into. Default is None, meaning the whole model is not split; otherwise it is split into move_model_batches + 1 (non-layer parameters) + 1 (multimodal component parameters) batches.
- offload_optimizer: Whether to offload optimizer parameters during vLLM/LMDeploy inference. Default is False.
- offload_model: Whether to offload the model itself during vLLM/LMDeploy inference. Default is False.
+- Note: If this parameter is set to True and grad_norm stays at 0 during training, please install `vllm==0.7.3`.
- gc_collect_after_offload: Whether to run garbage collection (both Python GC and GPU GC) after offloading finishes. Default is False.
- multi_turn_func: Multi-turn GRPO parameter. Pass the corresponding plugin name and add the matching implementation in plugin/multi_turn.py.
- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. For the split to be valid, per_device_train_batch_size must be divisible by mini_batch_size.
@@ -578,6 +579,7 @@ App parameters inherit from the [deployment parameters](#部署参数) and [Web-UI parameters](#Web-UI参数)
### qwen2_5_omni
In addition to the model-specific parameters of qwen2_5_vl and qwen2_audio, qwen2_5_omni also includes the following parameters:
- USE_AUDIO_IN_VIDEO: Default is False.
+- 🔥ENABLE_AUDIO_OUTPUT: Default is True. If training with zero3, set it to False.

### internvl, internvl_phi3
For the meaning of the arguments, see [here](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions

@@ -356,6 +356,7 @@
|[deepseek-ai/DeepSeek-V3](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)|
|[deepseek-ai/DeepSeek-V3-0324](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3-0324)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324)|
|[cognitivecomputations/DeepSeek-V3-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-awq)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[cognitivecomputations/DeepSeek-V3-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-AWQ)|
+|[cognitivecomputations/DeepSeek-V3-0324-AWQ](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-0324-AWQ)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[cognitivecomputations/DeepSeek-V3-0324-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-0324-AWQ)|
|[deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1)|deepseek_r1|deepseek_r1|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)|
|[deepseek-ai/DeepSeek-R1-Zero](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Zero)|deepseek_r1|deepseek_r1|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-R1-Zero](https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero)|
|[cognitivecomputations/DeepSeek-R1-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-awq)|deepseek_r1|deepseek_r1|transformers>=4.39.3|&#x2718;|-|[cognitivecomputations/DeepSeek-R1-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ)|

docs/source_en/GetStarted/SWIFT-installation.md

Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ More images can be found [here](https://modelscope.cn/docs/intro/environment-set
| peft | >=0.11,<0.16 | | |
| trl | >=0.13,<0.17 | 0.16 | RLHF |
| deepspeed | >=0.14 | 0.14.5 | Training |
-| vllm | >=0.5.1 | 0.8.3 | Inference/Deployment/Evaluation |
+| vllm | >=0.5.1 | 0.7.3/0.8.3 | Inference/Deployment/Evaluation |
| lmdeploy | >=0.5 | 0.7.2.post1 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |
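
A hedged install sketch based on this table; the pins are taken from the recommended-version column above, and whether vllm 0.7.3 or 0.8.3 fits depends on the GRPO offload note elsewhere in this commit:

    # Install the recommended versions from the dependency table
    pip install "trl==0.16" "deepspeed==0.14.5" "vllm==0.8.3" "lmdeploy==0.7.2.post1"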

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 2 additions & 0 deletions

@@ -424,6 +424,7 @@ The meanings of the following parameters can be referenced [here](https://huggin
- move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches.
- offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`.
- offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`.
+- Note: If this parameter is set to True and grad_norm remains zero during training, please install vllm==0.7.3.
- gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`.
- multi_turn_func: The multi-turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py.
- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. To ensure the split is valid, per_device_train_batch_size needs to be divisible by mini_batch_size.
@@ -590,6 +591,7 @@ The parameter meanings are the same as in the `qwen_vl_utils` or `qwen_omni_util
### qwen2_5_omni
qwen2_5_omni not only includes the model-specific parameters of qwen2_5_vl and qwen2_audio, but also contains the following parameters:
- USE_AUDIO_IN_VIDEO: Default is False.
+- 🔥ENABLE_AUDIO_OUTPUT: Default is True. If training with zero3, set it to False (a launch sketch follows this diff).

### internvl, internvl_phi3
For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)
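
A minimal launch sketch for the ENABLE_AUDIO_OUTPUT note above. Only the two environment variables come from this diff; the model id, dataset, and remaining flags are illustrative assumptions about a typical `swift sft` run under ZeRO-3:

    # Disable audio output (and audio-in-video) when training qwen2_5_omni with zero3
    ENABLE_AUDIO_OUTPUT=False \
    USE_AUDIO_IN_VIDEO=False \
    swift sft \
        --model Qwen/Qwen2.5-Omni-7B \
        --dataset my_dataset.jsonl \
        --train_type lora \
        --deepspeed zero3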

docs/source_en/Instruction/GRPO.md

Lines changed: 1 addition & 0 deletions

@@ -136,6 +136,7 @@ Arguments
- move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, which means the entire model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multi-modal component parameters) batches.
- offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`.
- offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`.
+- Note: If this parameter is set to True and grad_norm remains zero during training, please install vllm==0.7.3 (see the sketch after this diff).
- gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`.
- multi_turn_func: The multi-turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py.
- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. To ensure the split is valid, per_device_train_batch_size needs to be divisible by mini_batch_size.
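
A hedged sketch of how the offload options above might be combined in a GRPO run; the model, dataset, and reward function are placeholder assumptions, and only the offload/batching flags are taken from the list in this diff:

    # GRPO with model + optimizer offloaded during vLLM rollouts,
    # layers moved in 4 batches, GC after each offload.
    # per_device_train_batch_size=8 is divisible by mini_batch_size=4,
    # so each device batch splits into two sub-batches of 4.
    swift rlhf \
        --rlhf_type grpo \
        --model Qwen/Qwen2.5-7B-Instruct \
        --dataset my_dataset.jsonl \
        --reward_funcs accuracy \
        --use_vllm true \
        --offload_model true \
        --offload_optimizer true \
        --gc_collect_after_offload true \
        --move_model_batches 4 \
        --per_device_train_batch_size 8 \
        --mini_batch_size 4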

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions

@@ -356,6 +356,7 @@ The table below introduces the models integrated with ms-swift:
|[deepseek-ai/DeepSeek-V3](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)|
|[deepseek-ai/DeepSeek-V3-0324](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3-0324)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324)|
|[cognitivecomputations/DeepSeek-V3-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-awq)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[cognitivecomputations/DeepSeek-V3-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-AWQ)|
+|[cognitivecomputations/DeepSeek-V3-0324-AWQ](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-0324-AWQ)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|&#x2718;|-|[cognitivecomputations/DeepSeek-V3-0324-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-0324-AWQ)|
|[deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1)|deepseek_r1|deepseek_r1|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)|
|[deepseek-ai/DeepSeek-R1-Zero](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Zero)|deepseek_r1|deepseek_r1|transformers>=4.39.3|&#x2718;|-|[deepseek-ai/DeepSeek-R1-Zero](https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero)|
|[cognitivecomputations/DeepSeek-R1-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-awq)|deepseek_r1|deepseek_r1|transformers>=4.39.3|&#x2718;|-|[cognitivecomputations/DeepSeek-R1-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ)|
