@@ -56,6 +56,10 @@ def __init__(
         self.accum_loss = torch.zeros(1, device=self.device)
         self.accum_reward = torch.zeros(1, device=self.device)
         self.accum_kl = torch.zeros(1, device=self.device)
+        self.accum_format_reward = torch.zeros(1, device=self.device)
+        self.accum_acc_reward = torch.zeros(1, device=self.device)
+        self.accum_advantages = torch.zeros(1, device=self.device)
+        self.accum_response_length = torch.zeros(1, device=self.device)
         self.accum_count = 0
 
         # Reference model is initialized from policy model.
@@ -80,7 +84,7 @@ def __init__(
         self.policy_loss_fn = PolicyLoss()
         self.global_step = 0
         if use_wandb and self.rank == 0:
-            self.wandb_run = wandb.init(project="GRPO-Test", sync_tensorboard=True)
+            self.wandb_run = wandb.init(project="GRPO-V1", sync_tensorboard=True)
 
     def setup(self):
         super().setup()
@@ -106,6 +110,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]:
         action_mask = data["action_mask"]
         num_action = action_mask.shape[1]
         old_action_log_probs = data["action_log_probs"]
+        response_length = torch.sum(action_mask, dim=1).to(torch.float32)
 
         need_update = (step_idx + 1) % self.num_microbatches == 0
@@ -133,9 +138,14 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]:
             )
             kl = torch.sum(per_token_kl * action_mask, dim=-1) / torch.sum(action_mask, dim=-1)
 
-            reward = self.reward_model(
+            reward_group = self.reward_model(
                 data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"]
             )
+
+            reward = torch.tensor([value[0] for value in reward_group]).to(data["input_ids"].device)
+            format_reward = torch.tensor([value[1] for value in reward_group]).to(data["input_ids"].device)
+            acc_reward = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device)
+
             # [batch_size, num_generations]
             group_reward = reward.view(-1, self.num_generations)
 
@@ -159,9 +169,18 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]:
             loss = all_reduce_mean(loss, self.plugin)
             reward = all_reduce_mean(reward.mean(), self.plugin)
             kl = all_reduce_mean(kl.mean(), self.plugin)
+            format_reward = all_reduce_mean(format_reward.mean(), self.plugin)
+            acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin)
+            advantages = all_reduce_mean(advantages.mean(), self.plugin)
+            response_length = all_reduce_mean(response_length.mean(), self.plugin)
+            # Accumulate values for logging.
             self.accum_loss.add_(loss.data)
             self.accum_reward.add_(reward.data)
             self.accum_kl.add_(kl.data)
+            self.accum_format_reward.add_(format_reward.data)
+            self.accum_acc_reward.add_(acc_reward.data)
+            self.accum_advantages.add_(advantages.data)
+            self.accum_response_length.add_(response_length.data)
             self.accum_count += 1
         if need_update:
             self.optimizer.step()
@@ -171,21 +190,38 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]:
                 print(
                     "Loss:",
                     self.accum_loss.item() / self.accum_count,
-                    "Reward:",
+                    "\nReward:",
                     self.accum_reward.item() / self.accum_count,
-                    "KL:",
+                    "\nFormat Reward:",
+                    self.accum_format_reward.item() / self.accum_count,
+                    "\nAcc Reward:",
+                    self.accum_acc_reward.item() / self.accum_count,
+                    "\nKL:",
                     self.accum_kl.item() / self.accum_count,
+                    "\nAdvantages:",
+                    self.accum_advantages.item() / self.accum_count,
+                    "\nResponse Length:",
+                    self.accum_response_length.item() / self.accum_count,
                 )
                 self.wandb_run.log(
                     {
                         "train/loss": self.accum_loss.item() / self.accum_count,
                         "train/reward": self.accum_reward.item() / self.accum_count,
+                        "train/format_reward": self.accum_format_reward.item() / self.accum_count,
+                        "train/acc_reward": self.accum_acc_reward.item() / self.accum_count,
                         "train/kl": self.accum_kl.item() / self.accum_count,
+                        "train/advantages": self.accum_advantages.item() / self.accum_count,
+                        "train/response_length": self.accum_response_length.item() / self.accum_count,
                     }
                 )
             self.accum_loss.zero_()
             self.accum_reward.zero_()
+            self.accum_acc_reward.zero_()
+            self.accum_format_reward.zero_()
             self.accum_kl.zero_()
+            self.accum_advantages.zero_()
+            self.accum_response_length.zero_()
+
             self.accum_count = 0
             return loss_scalar
 
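Taken together, the commit decomposes the scalar reward into a format component and an accuracy component, tracks advantages and response length, and averages everything over the accumulated micro-batches before logging. Below is a minimal, self-contained sketch of that bookkeeping pattern in plain PyTorch; the MetricAccumulator helper, the hard-coded reward triples, and the advantage-normalization epsilon are illustrative assumptions, not part of the actual GRPOConsumer or its reward model.

# Sketch only: a standalone illustration of the metric bookkeeping added by this
# commit. `MetricAccumulator` and the toy reward triples are hypothetical; the
# real consumer keeps per-metric accum_* tensors and reduces them across ranks.
import torch


class MetricAccumulator:
    """Keeps running sums of scalar metrics and reports their means."""

    def __init__(self, names, device="cpu"):
        self.sums = {name: torch.zeros(1, device=device) for name in names}
        self.count = 0

    def add(self, **metrics):
        # Accumulate one micro-batch worth of scalar tensors.
        for name, value in metrics.items():
            self.sums[name].add_(value.detach())
        self.count += 1

    def pop_means(self):
        # Average over the accumulated micro-batches, then reset, mirroring the
        # accum_*.item() / accum_count reads and accum_*.zero_() calls above.
        means = {name: (total / self.count).item() for name, total in self.sums.items()}
        for total in self.sums.values():
            total.zero_()
        self.count = 0
        return means


# Suppose the reward model returns one (total, format, accuracy) triple per sample,
# which is what the unpacking of `reward_group` in the diff suggests.
reward_group = [(1.0, 0.5, 0.5), (0.0, 0.0, 0.0), (1.5, 0.5, 1.0), (0.5, 0.5, 0.0)]
reward = torch.tensor([r[0] for r in reward_group])
format_reward = torch.tensor([r[1] for r in reward_group])
acc_reward = torch.tensor([r[2] for r in reward_group])

# Group-normalized advantages over num_generations completions per prompt
# (a common GRPO formulation; the exact epsilon here is an assumption).
num_generations = 2
group_reward = reward.view(-1, num_generations)
reward_mean = group_reward.mean(dim=1).repeat_interleave(num_generations)
reward_std = group_reward.std(dim=1).repeat_interleave(num_generations)
advantages = (reward - reward_mean) / (reward_std + 1e-4)

metrics = MetricAccumulator(["reward", "format_reward", "acc_reward", "advantages"])
metrics.add(
    reward=reward.mean(),
    format_reward=format_reward.mean(),
    acc_reward=acc_reward.mean(),
    advantages=advantages.mean(),
)
print(metrics.pop_means())  # {'reward': 0.75, 'format_reward': 0.375, ...}

Logging the reward components and response length separately makes it easy to see whether reward gains come from formatting compliance or from answer accuracy, and whether responses grow longer as training progresses.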