
Commit e20a65a

fix ppo zero3 (#4263)
1 parent 88094e0

6 files changed, +89 -4 lines

examples/train/multimodal/rlhf/dpo/full.sh

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# 4 * 50GiB
+nproc_per_node=4
+
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+NPROC_PER_NODE=$nproc_per_node \
+MAX_PIXELS=1003520 \
+swift rlhf \
+    --rlhf_type dpo \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
+    --dataset 'swift/RLAIF-V-Dataset#20000' \
+    --train_type full \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-5 \
+    --freeze_vit true \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --deepspeed zero3 \
+    --logging_steps 5 \
+    --max_length 4096 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --dataset_num_proc 4 \
+    --save_only_model true
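
Note on the accumulation arithmetic: `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)` keeps the effective global batch size pinned at 16 regardless of the GPU count (the PPO script further down does the same with 8 processes, yielding 2 accumulation steps). A quick sanity check of that arithmetic, assuming the usual product of data-parallel processes, per-device batch size, and accumulation steps:

    # Effective global batch = data-parallel processes * per-device batch * accumulation steps.
    nproc_per_node = 4                                   # GPUs used by this script
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 16 // nproc_per_node   # expr 16 / $nproc_per_node -> 4
    global_batch = nproc_per_node * per_device_train_batch_size * gradient_accumulation_steps
    print(global_batch)  # 16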

examples/train/multimodal/rlhf/dpo.sh renamed to examples/train/multimodal/rlhf/dpo/lora.sh

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# 4*50GiB
+# 4 * 50GiB
 # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
 # --rlhf_type cpo/orpo/simpo/rm are also supported
 nproc_per_node=2

examples/train/rlhf/ppo/full.sh

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# 8 * 65 GiB
+# Currently, it only supports the case where the model and reward_model use the same template/tokenizer.
+# Currently, multimodal model PPO is not supported.
+nproc_per_node=8
+
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+NPROC_PER_NODE=$nproc_per_node \
+swift rlhf \
+    --rlhf_type ppo \
+    --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
+    --reward_model 'AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2' \
+    --train_type full \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' 'AI-ModelScope/alpaca-gpt4-data-en#20000' \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-6 \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --deepspeed zero3 \
+    --response_length 512 \
+    --temperature 0.7 \
+    --dataset_num_proc 4 \
+    --save_only_model true

examples/train/rlhf/ppo.sh renamed to examples/train/rlhf/ppo/lora.sh

Lines changed: 4 additions & 1 deletion
@@ -1,7 +1,9 @@
+# 4 * 50GiB
 # Currently, it only supports the case where the model and reward_model use the same template/tokenizer.
 # Currently, multimodal model PPO is not supported.
 nproc_per_node=4

+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 NPROC_PER_NODE=$nproc_per_node \
 swift rlhf \
@@ -30,4 +32,5 @@ swift rlhf \
     --deepspeed zero2 \
     --response_length 512 \
     --temperature 0.7 \
-    --dataset_num_proc 4
+    --dataset_num_proc 4 \
+    --save_only_model true

swift/trainers/mixin.py

Lines changed: 1 addition & 1 deletion
@@ -170,7 +170,7 @@ def _load_optimizer_and_scheduler(self, *args, **kwargs):
     def _save_model(self, output_dir: Optional[str] = None, state_dict=None):
         # model
         supported_classes = (SwiftModel, PreTrainedModel, PeftModel)
-        supported_names = ('SentenceTransformer')
+        supported_names = ('SentenceTransformer', )
         if AutoModelForCausalLMWithValueHead is not None:
             supported_classes = supported_classes + (AutoModelForCausalLMWithValueHead, )
         save_safetensors = self.args.save_safetensors
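
The one-character fix above matters because `('SentenceTransformer')` is only a parenthesized string; the trailing comma is what makes it a one-element tuple. A minimal standalone sketch of the difference (it does not reproduce the exact check `supported_names` feeds into in mixin.py):

    # Parentheses alone do not create a tuple; the trailing comma does.
    as_str = ('SentenceTransformer')
    as_tuple = ('SentenceTransformer', )

    print(type(as_str).__name__)    # str
    print(type(as_tuple).__name__)  # tuple

    # Membership tests then behave differently: `in` on a string is a
    # substring match, while `in` on a tuple compares whole elements.
    print('Transformer' in as_str)    # True  (substring)
    print('Transformer' in as_tuple)  # False (not an element)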

swift/trainers/rlhf_trainer/ppo_trainer.py

Lines changed: 20 additions & 1 deletion
@@ -1,11 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import inspect
 from contextlib import contextmanager
+from typing import Optional

 import transformers
 from packaging import version
 from torch.utils.data import DataLoader
-from transformers import PreTrainedModel
+from transformers import PreTrainedModel, Trainer
 from trl import PPOTrainer as HFPPOTrainer

 from swift.utils import patch_getattr
@@ -63,3 +64,21 @@ def _save_checkpoint(self, *args, **kwargs):
         trial = kwargs.get('trial')
         self._determine_best_metric(metrics=metrics, trial=trial)
         return super()._save_checkpoint(*args, **kwargs)
+
+    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
+        # https://github.com/huggingface/trl/issues/2122
+        backup_model = self.model
+        self.model = self.model.policy  # save only the policy
+
+        Trainer.save_model(self, output_dir, _internal_call)
+
+        self.model = backup_model
+
+    def _save(self, output_dir: Optional[str] = None, state_dict=None):
+        if self.is_deepspeed_enabled:
+            state_dict = {
+                name.removeprefix('policy.'): param
+                for name, param in state_dict.items() if name.startswith('policy.')
+            }
+
+        super()._save(output_dir, state_dict)
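
These two overrides are the heart of the ZeRO-3 fix. TRL's `PPOTrainer` trains a wrapper around the policy (hence `self.model.policy`), so `save_model` temporarily swaps the policy in before delegating to the stock `Trainer.save_model` (see the linked trl issue), while under DeepSpeed the gathered `state_dict` still carries the wrapper's `policy.` key prefix alongside non-policy weights, which `_save` filters out and strips. A toy run of that same comprehension, with made-up parameter names and shapes:

    import torch

    # Pretend state_dict gathered from the wrapped PPO module under ZeRO-3.
    wrapped_state_dict = {
        'policy.model.embed_tokens.weight': torch.zeros(4, 2),
        'policy.lm_head.weight': torch.zeros(4, 2),
        'value_model.score.weight': torch.zeros(1, 2),  # not part of the policy -> dropped
    }

    # Same filtering/renaming as the `_save` override: keep policy weights and
    # drop the 'policy.' namespace so keys match the standalone model.
    policy_state_dict = {
        name.removeprefix('policy.'): param
        for name, param in wrapped_state_dict.items() if name.startswith('policy.')
    }
    print(sorted(policy_state_dict))  # ['lm_head.weight', 'model.embed_tokens.weight']

Note that `str.removeprefix` requires Python 3.9 or newer.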
