Add per-mini-batch policy entropy tracking to the GRPO consumer step

YeAnbang · YeAnbang · commit ddda79c36f38 · 2025-11-06T10:57:32.000+08:00
diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py
@@ -346,6 +346,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]:
                         data_policy_forward["reference_action_log_probs"] = reference_action_log_probs
 
                     kl = []
+                    policy_model_logits = torch.empty_like(input_ids_forward_micro_batch, device=self.device)
 
                     def _criterion(outputs, inputs):
                         action_logits = outputs.logits
@@ -425,6 +426,20 @@ def _criterion(outputs, inputs):
                             kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin).data
                             mean_kl.append(kl)
                         mean_loss.append(all_reduce_mean(loss, self.plugin).data)
+                        mini_batch_entropies.append(
+                            all_reduce_mean(
+                                (
+                                    (
+                                        (
+                                            entropy_from_logits(policy_model_logits[:, -num_action:])
+                                            * action_mask_forward_micro_batch
+                                        ).sum(-1)
+                                    )
+                                    / action_mask_forward_micro_batch.sum(-1)
+                                ).detach(),
+                                self.plugin,
+                            )
+                        )
                 else:
                     policy_model_logits = self.policy_model(
                         input_ids=input_ids_forward_micro_batch,