
Commit e29bd29

Add batch level std calculation (agentscope-ai#311)
1 parent 894858b commit e29bd29

3 files changed: +185 −8 lines changed

tests/algorithm/advantage_fn_test.py

Lines changed: 104 additions & 0 deletions

@@ -146,6 +146,55 @@ def test_grpo_correct_bias(self):
                 places=6,
             )
 
+    def test_batch_level_std_grpo(self):
+        advantage_fn_cls = ADVANTAGE_FN.get("grpo")
+        self.assertIsNotNone(advantage_fn_cls)
+        advantage_fn = advantage_fn_cls(epsilon=1e-7, std_cal_level="batch")
+
+        rewards_task0 = [1.0, 2.0, 3.0]
+        rewards_task1 = [11.0, 12.0, 13.0]
+
+        exps = [
+            Experience(
+                eid=EID(batch=0, task=0, run=i),
+                tokens=torch.zeros(5),
+                prompt_length=2,
+                reward=rewards_task0[i],
+                action_mask=torch.tensor([0, 0, 1, 1, 1], dtype=torch.float32),
+            )
+            for i in range(len(rewards_task0))
+        ]
+        exps.extend(
+            [
+                Experience(
+                    eid=EID(batch=0, task=1, run=i),
+                    tokens=torch.zeros(5),
+                    prompt_length=2,
+                    reward=rewards_task1[i],
+                    action_mask=torch.tensor([0, 0, 1, 1, 1], dtype=torch.float32),
+                )
+                for i in range(len(rewards_task1))
+            ]
+        )
+
+        all_rewards = torch.tensor(rewards_task0 + rewards_task1, dtype=torch.float32)
+        batch_std = torch.std(all_rewards)
+
+        group0_mean = torch.mean(torch.tensor(rewards_task0, dtype=torch.float32))
+
+        processed_exps, metrics = advantage_fn(exps)
+        self.assertIn("group_advantages/reward_mean/mean", metrics)
+        self.assertIn("group_advantages/reward_std/mean", metrics)
+        self.assertEqual(len(processed_exps), len(rewards_task0) + len(rewards_task1))
+
+        target_exp = next(exp for exp in processed_exps if exp.eid.task == 0 and exp.eid.run == 1)
+        expected_advantage_value = (target_exp.reward - group0_mean) / (
+            batch_std + advantage_fn.epsilon
+        )
+        expected_advantages = expected_advantage_value * target_exp.action_mask
+        self.assertTrue(torch.allclose(target_exp.advantages, expected_advantages, atol=1e-6))
+        self.assertTrue(torch.allclose(target_exp.returns, expected_advantages, atol=1e-6))
+
     def test_duplicate_grpo(self):
         advantage_fn_cls = ADVANTAGE_FN.get("grpo")
         self.assertIsNotNone(advantage_fn_cls)
@@ -222,3 +271,58 @@ def test_step_wise_grpo_advantage(self):
             metrics["group_advantages/reward_std/mean"]
             == torch.std(torch.tensor([i for i in range(repeat_times)], dtype=torch.float32)).item()
         )
+
+    def test_batch_level_step_wise_grpo_advantage(self):
+        advantage_fn_cls = ADVANTAGE_FN.get("step_wise_grpo")
+        self.assertIsNotNone(advantage_fn_cls)
+        advantage_fn = advantage_fn_cls(epsilon=1e-7, std_cal_level="batch")
+
+        task_num = 2
+        repeat_times = 3  # runs
+        step_num = 4
+
+        # Let reward vary by task and run to make the test meaningful
+        # reward = task*10 + run*1 (the same reward for every step of a run)
+        exps = []
+        all_rewards_list = []
+        for j in range(task_num):  # task
+            for i in range(repeat_times):  # run
+                reward_val = float(j * 10 + i * 1)
+                all_rewards_list.append(reward_val)
+                for k in range(step_num):  # step
+                    exps.append(
+                        Experience(
+                            eid=EID(batch=0, task=j, run=i, step=k),
+                            tokens=torch.zeros(5),
+                            prompt_length=2,
+                            reward=reward_val,
+                            action_mask=torch.tensor([0, 0, 1, 1, 1], dtype=torch.float32),
+                        )
+                    )
+
+        all_rewards = torch.tensor(all_rewards_list, dtype=torch.float32)
+        batch_std = torch.std(all_rewards)
+
+        # For a specific group (e.g., task = 0)
+        group_rewards = [
+            float(0 * 10 + 1 * k) for k in range(repeat_times)
+        ]  # [0.0, 1.0, 2.0] for task = 0
+        group_mean = torch.mean(torch.tensor(group_rewards, dtype=torch.float32))
+
+        processed_exps, metrics = advantage_fn(exps)
+        self.assertIn("group_advantages/reward_mean/mean", metrics)
+        self.assertIn("group_advantages/reward_std/mean", metrics)
+        self.assertEqual(len(processed_exps), task_num * repeat_times * step_num)
+
+        # Pick a target experience: task=0, run=1, step=0. Its reward is 1.0
+        target_exp = next(
+            exp
+            for exp in processed_exps
+            if exp.eid.task == 0 and exp.eid.run == 1 and exp.eid.step == 0
+        )
+        expected_advantage_value = (target_exp.reward - group_mean) / (
+            batch_std + advantage_fn.epsilon
+        )
+        expected_advantages = expected_advantage_value * target_exp.action_mask
+        self.assertTrue(torch.allclose(target_exp.advantages, expected_advantages, atol=1e-6))
+        self.assertTrue(torch.allclose(target_exp.returns, expected_advantages, atol=1e-6))
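As a quick sanity check on the numbers these tests assert, the normalization can be reproduced with plain torch outside the test harness. The snippet below is an illustrative sketch, not part of the commit; it reuses only the reward values from test_batch_level_std_grpo and shows how the batch-wide std shrinks the advantage relative to the per-group std.

import torch

# Rewards from test_batch_level_std_grpo: two task groups of three runs each.
epsilon = 1e-7
rewards_task0 = torch.tensor([1.0, 2.0, 3.0])
rewards_task1 = torch.tensor([11.0, 12.0, 13.0])

batch_std = torch.std(torch.cat([rewards_task0, rewards_task1]))  # ≈ 5.5498 over all six rewards
group0_mean = torch.mean(rewards_task0)                           # 2.0
group0_std = torch.std(rewards_task0)                             # 1.0 (what std_cal_level="group" would use)

reward = 3.0  # the run with eid.run == 2 in task 0
group_level_adv = (reward - group0_mean) / (group0_std + epsilon)  # ≈ 1.0
batch_level_adv = (reward - group0_mean) / (batch_std + epsilon)   # ≈ 0.18
print(group_level_adv.item(), batch_level_adv.item())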

trinity/algorithm/advantage_fn/grpo_advantage.py

Lines changed: 36 additions & 4 deletions

@@ -101,6 +101,7 @@ def __init__(
         std_threshold: Optional[float] = None,
         duplicate_experiences: bool = False,
         rank_penalty: Optional[float] = None,
+        std_cal_level: str = "group",  # "group" or "batch"
     ) -> None:
         """Initialize the GRPO advantage function.
 
@@ -112,17 +113,28 @@ def __init__(
                 count. Only used when `std_threshold` is not None (https://hkunlp.github.io/blog/2025/Polaris).
             rank_penalty (Optional[float]): A penalty applied to the rank of rewards to correct for bias
                 (https://arxiv.org/pdf/2506.02355).
+            std_cal_level (str): The scope for calculating the reward standard deviation for normalization.
+                Can be 'group' (default, std is calculated per group) or 'batch' (std is calculated
+                across the entire batch). The mean is always calculated per group.
+                Calculating the mean at the local (group) level and the standard deviation at the global (batch)
+                level enables more robust reward shaping (https://arxiv.org/pdf/2508.08221v1).
         """
         self.epsilon = epsilon
         self.std_threshold = std_threshold
         self.duplicate_experiences = duplicate_experiences
         self.rank_penalty = rank_penalty
+        self.std_cal_level = std_cal_level
+        if self.std_cal_level not in ["group", "batch"]:
+            raise ValueError("std_cal_level must be either 'group' or 'batch'")
 
     def group_experiences(self, exps):
         return group_by(exps, id_type="task")
 
     def calculate_group_advantage(
-        self, group_id: str, exps: List[Experience]
+        self,
+        group_id: str,
+        exps: List[Experience],
+        precomputed_std: Optional[torch.Tensor] = None,
     ) -> Tuple[List[Experience], Dict]:
         metrics = {}
         with torch.no_grad():
@@ -155,7 +167,10 @@ def calculate_group_advantage(
                 exps.clear()
 
             for exp in exps:
-                score = (exp.reward - group_reward_mean) / (group_reward_std + self.epsilon)
+                if self.std_cal_level == "batch" and precomputed_std is not None:
+                    score = (exp.reward - group_reward_mean) / (precomputed_std + self.epsilon)
+                else:
+                    score = (exp.reward - group_reward_mean) / (group_reward_std + self.epsilon)
                 exp.advantages = score * exp.action_mask
                 exp.returns = exp.advantages.clone()
 
@@ -185,8 +200,19 @@ def _duplicate_experiences(self, exp_groups: Dict[str, List[Experience]]) -> List[Experience]:
     def process(self, exps):
         exp_groups = self.group_experiences(exps)
         metric_list = []
+        precomputed_std = None
+        if self.std_cal_level == "batch":
+            all_rewards = torch.tensor(
+                [exp.reward for exp in exps], dtype=torch.float32
+            )  # All rewards in the batch
+            if len(all_rewards) <= 1:
+                precomputed_std = torch.tensor(1.0)
+            else:
+                precomputed_std = torch.std(all_rewards)
         for group_id, group_exps in exp_groups.items():
-            group_exps, group_metrics = self.calculate_group_advantage(group_id, group_exps)
+            group_exps, group_metrics = self.calculate_group_advantage(
+                group_id, group_exps, precomputed_std=precomputed_std
+            )
             metric_list.append(group_metrics)
             try:
                 # TODO: sum skipped count
@@ -201,4 +227,10 @@ def process(self, exps):
 
     @classmethod
     def default_args(cls) -> dict:
-        return {"epsilon": 1e-6}
+        return {
+            "epsilon": 1e-6,
+            "std_threshold": None,
+            "duplicate_experiences": False,
+            "rank_penalty": None,
+            "std_cal_level": "group",
+        }
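Taken together, the changes to process() and calculate_group_advantage() amount to: compute one std over every reward in the batch (falling back to 1.0 when there is at most one reward), then normalize each group with its own mean but that shared std. The function below is a self-contained sketch of that flow, assuming plain per-task reward lists stand in for the Experience groups; it is not the library implementation, which also multiplies each score by the experience's action_mask and emits metrics.

import torch
from typing import Dict, List, Optional


def batch_level_advantages(
    groups: Dict[str, List[float]],  # task_id -> rewards of its runs (stand-in for Experience groups)
    epsilon: float = 1e-6,
    std_cal_level: str = "batch",
) -> Dict[str, List[float]]:
    # Illustrative sketch of the commit's group-mean / batch-std normalization.
    if std_cal_level not in ["group", "batch"]:
        raise ValueError("std_cal_level must be either 'group' or 'batch'")

    precomputed_std: Optional[torch.Tensor] = None
    if std_cal_level == "batch":
        all_rewards = torch.tensor(
            [r for rs in groups.values() for r in rs], dtype=torch.float32
        )
        precomputed_std = torch.tensor(1.0) if len(all_rewards) <= 1 else torch.std(all_rewards)

    advantages = {}
    for task_id, rs in groups.items():
        rewards = torch.tensor(rs, dtype=torch.float32)
        group_mean = torch.mean(rewards)
        group_std = torch.std(rewards) if len(rs) > 1 else torch.tensor(1.0)
        std = precomputed_std if precomputed_std is not None else group_std
        advantages[task_id] = ((rewards - group_mean) / (std + epsilon)).tolist()
    return advantages


# With the rewards from the tests, task0's scores shrink from [-1, 0, 1]
# (group std = 1.0) to roughly [-0.18, 0.0, 0.18] (batch std ≈ 5.55).
print(batch_level_advantages({"task0": [1.0, 2.0, 3.0], "task1": [11.0, 12.0, 13.0]}))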

trinity/algorithm/advantage_fn/multi_step_grpo_advantage.py

Lines changed: 45 additions & 4 deletions

@@ -1,6 +1,6 @@
 """GRPO advantage computation for multi-step scenarios
 """
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import torch
 
@@ -21,13 +21,29 @@ def __init__(
         self,
         epsilon: float = 1e-6,
         enable_step_norm: bool = False,
+        std_cal_level: str = "group",  # 'group' (task-level) or 'batch'
         **kwargs,
     ) -> None:
+        """Initialize the Step-wise GRPO advantage function.
+
+        Args:
+            epsilon (float): A small value to avoid division by zero.
+            enable_step_norm (bool): If True, normalize advantages by trajectory length.
+            std_cal_level (str): The scope for calculating reward standard deviation.
+                'group' (default): Std is calculated per task group.
+                'batch': Std is calculated across all last-step rewards in the entire batch.
+                The mean is always calculated per task group.
+        """
         self.epsilon = epsilon
         self.enable_step_norm = enable_step_norm
+        self.std_cal_level = std_cal_level
+        if self.std_cal_level not in ["group", "batch"]:
+            raise ValueError("std_cal_level must be either 'group' or 'batch'")
 
     def calculate_last_step_advantage(
-        self, exps: Dict[str, Experience]
+        self,
+        exps: Dict[str, Experience],
+        precomputed_std: Optional[torch.Tensor] = None,
     ) -> Tuple[Dict[str, float], Dict[str, float]]:
         """Calculate group advantage for a given group of experiences.
 
@@ -48,7 +64,10 @@ def calculate_last_step_advantage(
         group_reward_std = torch.std(rewards)
         scores = {}
         for rid, exp in exps.items():
-            score = (exp.reward - group_reward_mean) / (group_reward_std + self.epsilon)
+            if self.std_cal_level == "batch" and precomputed_std is not None:
+                score = (exp.reward - group_reward_mean) / (precomputed_std + self.epsilon)
+            else:
+                score = (exp.reward - group_reward_mean) / (group_reward_std + self.epsilon)
             scores[rid] = score.item()
         metrics = {
             "reward_mean": group_reward_mean.item(),
@@ -85,14 +104,36 @@ def process(self, exps: List[Experience]) -> Tuple[List[Experience], Dict]:
         metric_list = []
         # Step 1: split the experiences into sub-groups by task
         task_exps = group_by(exps, "task")
+
+        # --- Pre-computation step for batch-level standard deviation ---
+        precomputed_std = None
+        if self.std_cal_level == "batch":
+            all_laststep_rewards = []
+            for task_exp in task_exps.values():
+                # First, group all experiences by run to find the last step of each run
+                task_run_exps = group_by(task_exp, "run")
+                # Collect rewards from the last step of every run in the entire batch
+                last_step_rewards = [
+                    run_steps[-1].reward for run_steps in task_run_exps.values() if run_steps
+                ]
+                all_laststep_rewards.extend(last_step_rewards)
+
+            if len(all_laststep_rewards) <= 1:
+                precomputed_std = torch.tensor(1.0)
+            else:
+                precomputed_std = torch.std(torch.tensor(all_laststep_rewards, dtype=torch.float32))
+        # --- End of pre-computation ---
+
         # Step 2: further split each task's experiences into sub-groups by run
         result_exps = []
         for task_exp in task_exps.values():
             run_exps = group_by(task_exp, "run")
 
             # Step3: extract the last experience (last step) from each run and calculate scores
             last_step_exps = {run_id: step_exps[-1] for run_id, step_exps in run_exps.items()}
-            scores, metrics = self.calculate_last_step_advantage(last_step_exps)
+            scores, metrics = self.calculate_last_step_advantage(
+                last_step_exps, precomputed_std=precomputed_std
+            )
             metric_list.append(metrics)
 
             # Step 4: broadcast the advantages to all previous steps
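For the step-wise variant, only the pooling step differs: the batch std is taken over the last-step reward of every run across all tasks, and that value is then shared by calculate_last_step_advantage(). Below is a hedged sketch of just that pre-computation, with nested dicts standing in for the task/run/step grouping that group_by produces; names and data are illustrative.

import torch
from typing import Dict, List


def last_step_batch_std(
    task_run_step_rewards: Dict[str, Dict[str, List[float]]],  # task -> run -> per-step rewards
) -> torch.Tensor:
    # Collect the last-step reward of every run in the batch, then take their std,
    # falling back to 1.0 when there is at most one run (mirrors the diff above).
    last_step_rewards = [
        steps[-1]  # reward of the final step of this run
        for runs in task_run_step_rewards.values()
        for steps in runs.values()
        if steps
    ]
    if len(last_step_rewards) <= 1:
        return torch.tensor(1.0)
    return torch.std(torch.tensor(last_step_rewards, dtype=torch.float32))


# Two tasks, three runs each; rewards follow the test above (task*10 + run),
# repeated for every step of a run.
batch = {
    f"task{j}": {f"run{i}": [float(j * 10 + i)] * 4 for i in range(3)}
    for j in range(2)
}
print(last_step_batch_std(batch))  # std of [0, 1, 2, 10, 11, 12] ≈ 5.55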
