
Commit fe1f429

Merge branch 'grpo-latest-rebase-main' of https://github.com/hpcaitech/ColossalAI into grpo-latest-rebase-main

2 parents 4152c0b + 73bdfd8 · commit fe1f429

5 files changed: +10 -10 lines changed

applications/ColossalChat/coati/experience_maker/naive.py

Lines changed: 3 additions & 1 deletion
@@ -119,7 +119,9 @@ def make_experience(
     generate_kwargs["stop_token_ids"] = stop_token_ids
     # Hack: manually initialize cache_position to address transformer version conflict
     if generate_kwargs.get("cache_position", None) is None and generate_kwargs.get("use_cache", False) is True:
-        generate_kwargs["cache_position"] = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        generate_kwargs["cache_position"] = torch.arange(
+            0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
+        )
     torch.manual_seed(41)  # for tp, gurantee the same input for reward model

     if self.use_grpo and self.num_generation > 1:
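For context, the reformatted lines implement a small workaround: when generation relies on the KV cache and the caller has not supplied cache_position, it is pre-filled with positions 0..seq_len-1 of the prompt tokens. A minimal standalone sketch of the same idea follows; init_cache_position is an illustrative helper name, not a function in the repository:

import torch

def init_cache_position(input_ids: torch.Tensor, generate_kwargs: dict) -> dict:
    # Pre-fill cache_position only when the KV cache is in use and the caller
    # did not set it explicitly, mirroring the hack in make_experience above.
    if generate_kwargs.get("cache_position") is None and generate_kwargs.get("use_cache", False):
        generate_kwargs["cache_position"] = torch.arange(
            0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
        )
    return generate_kwargs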

applications/ColossalChat/coati/trainer/kto.py

Lines changed: 4 additions & 2 deletions
@@ -193,12 +193,14 @@ def _train(self, epoch: int):
     loss_mean = all_reduce_mean(tensor=loss)
     chosen_reward_mean = chosen_rewards.mean()
     chosen_rewards_list = [
-        torch.tensor(0, dtype=chosen_reward_mean.dtype, device=loss.device) for _ in range(dist.get_world_size())
+        torch.tensor(0, dtype=chosen_reward_mean.dtype, device=loss.device)
+        for _ in range(dist.get_world_size())
     ]
     dist.all_gather(chosen_rewards_list, chosen_reward_mean)
     rejected_reward_mean = rejected_rewards.mean()
     rejected_rewards_list = [
-        torch.tensor(0, dtype=rejected_reward_mean.dtype, device=loss.device) for _ in range(dist.get_world_size())
+        torch.tensor(0, dtype=rejected_reward_mean.dtype, device=loss.device)
+        for _ in range(dist.get_world_size())
     ]
     dist.all_gather(rejected_rewards_list, rejected_reward_mean)
     chosen_rewards_list = [i for i in chosen_rewards_list if not i.isnan()]
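The reformatted list comprehensions feed a standard all_gather pattern: each rank pre-allocates one placeholder tensor per process, gathers the per-rank reward means, and the surrounding code then filters out NaN entries before averaging. A minimal sketch of that pattern, assuming torch.distributed is already initialized (gather_scalar_means is an illustrative name, not part of kto.py):

import torch
import torch.distributed as dist

def gather_scalar_means(local_mean: torch.Tensor) -> list:
    # One placeholder tensor per rank, then gather every rank's scalar mean.
    buffer = [torch.zeros_like(local_mean) for _ in range(dist.get_world_size())]
    dist.all_gather(buffer, local_mean)
    # Drop NaN entries, as the trainer does with i.isnan() above.
    return [v for v in buffer if not v.isnan()]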

applications/ColossalChat/examples/training_scripts/train_grpo.py

Lines changed: 1 addition & 3 deletions
@@ -89,9 +89,7 @@ def train(args):
     actor = AutoModelForCausalLM.from_pretrained(args.pretrain, trust_remote_code=True)
     if args.rm_pretrain:
         reward_model = RewardModel(args.rm_pretrain, trust_remote_code=True)
-    ref_model = AutoModelForCausalLM.from_pretrained(
-        args.pretrain, trust_remote_code=True
-    )
+    ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, trust_remote_code=True)

     if args.lora_config is not None:
         actor = convert_to_lora_module(actor, lora_config=lora_config)
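The change here is purely cosmetic: the ref_model call is collapsed onto one line. The underlying pattern, shared by the GRPO and PPO scripts, is to load the reference policy as a second copy of the pretrained weights that is kept fixed while the actor is trained. A hedged sketch of that pattern (the model name and the freezing code are illustrative, not taken from the script):

from transformers import AutoModelForCausalLM

pretrain = "your-pretrained-model"  # placeholder for args.pretrain
actor = AutoModelForCausalLM.from_pretrained(pretrain, trust_remote_code=True)
ref_model = AutoModelForCausalLM.from_pretrained(pretrain, trust_remote_code=True)

# Keep the reference policy fixed; only the actor receives gradient updates.
ref_model.eval()
for p in ref_model.parameters():
    p.requires_grad_(False)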

applications/ColossalChat/examples/training_scripts/train_ppo.py

Lines changed: 1 addition & 3 deletions
@@ -102,9 +102,7 @@ def train(args):
         coordinator.print_on_master(msg="Flash-attention enabled successfully")
     else:
         actor = AutoModelForCausalLM.from_pretrained(args.pretrain, trust_remote_code=True)
-        ref_model = AutoModelForCausalLM.from_pretrained(
-            args.pretrain, trust_remote_code=True
-        )
+        ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, trust_remote_code=True)
     if not args.no_neural_reward_model:
         reward_model = RewardModel(args.rm_pretrain, trust_remote_code=True)
     critic = Critic(args.rm_pretrain)

applications/ColossalChat/tests/test_train.sh

Lines changed: 1 addition & 1 deletion
@@ -631,7 +631,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        done
    done
done
-
+

echo "[Test]: testing ORPO ..."
