Truncate IS ratios based on the sign of the advantage after ratio clipping

lehaoqu · lehaoqu · commit a6d0b6f5a438 · 2025-10-27T17:36:09.000+08:00
diff --git a/tests/algorithm/policy_loss_test.py b/tests/algorithm/policy_loss_test.py
@@ -30,22 +30,6 @@ def setUp(self):
             }
         )
 
-    def test_dcppo_policy_loss(self):
-        policy_loss_fn_cls = POLICY_LOSS_FN.get("dcppo")
-        policy_loss_fn_args = policy_loss_fn_cls.default_args()
-        policy_loss_fn = policy_loss_fn_cls(**policy_loss_fn_args)
-        loss, metrics = policy_loss_fn(logprob=self.logprob, **self.input_data.batch)
-        dcppo_loss = torch.tensor(0.26889559626579285)
-        pg_clipfrac = torch.tensor(0.3541666567325592)
-        pg_clipfrac_lower = torch.tensor(0.0625)
-        ppo_kl = torch.tensor(-0.21663446724414825)
-        self.assertTrue(torch.allclose(loss, dcppo_loss))
-        self.assertTrue(torch.allclose(torch.tensor(metrics["pg_clipfrac"]), pg_clipfrac))
-        self.assertTrue(torch.allclose(torch.tensor(metrics["pg_clipfrac_lower"]), pg_clipfrac_lower))
-        self.assertTrue(torch.allclose(torch.tensor(metrics["ppo_kl"]), ppo_kl))
-        self.assertTrue(torch.allclose(torch.tensor(metrics["pg_loss"]), dcppo_loss))
-
-
     def test_ppo_policy_loss(self):
         policy_loss_fn_cls = POLICY_LOSS_FN.get("ppo")
         policy_loss_fn_args = policy_loss_fn_cls.default_args()
@@ -130,7 +114,8 @@ def test_ppo_policy_loss_with_truncate_is(self):
         policy_loss_fn_cls = POLICY_LOSS_FN.get("ppo")
         policy_loss_fn_args = policy_loss_fn_cls.default_args()
         # Enable truncate large IS with default bounds [0.0, 2.0]
-        policy_loss_fn_args["truncate_large_is"] = True
+        policy_loss_fn_args["truncate_adv_pos_is"] = True
+        policy_loss_fn_args["truncate_adv_neg_is"] = True
         policy_loss_fn_args["truncate_is_range_low"] = 0.0
         policy_loss_fn_args["truncate_is_range_high"] = 2.0
         policy_loss_fn = policy_loss_fn_cls(**policy_loss_fn_args)
@@ -140,16 +125,23 @@ def test_ppo_policy_loss_with_truncate_is(self):
         ppo_loss_truncated = torch.tensor(0.2230827361345291)
         pg_clipfrac_truncated = torch.tensor(0.3541666567325592)
         ppo_kl_truncated = torch.tensor(-0.21663446724414825)
-        is_truncate_frac_expected = torch.tensor(0.2708333432674408)
+        is_truncate_frac_pos_expected = torch.tensor(0.0)
+        is_truncate_frac_neg_expected = torch.tensor(0.1041666641831398)
 
         self.assertTrue(torch.allclose(loss, ppo_loss_truncated))
         self.assertTrue(torch.allclose(torch.tensor(metrics["pg_clipfrac"]), pg_clipfrac_truncated))
         self.assertTrue(torch.allclose(torch.tensor(metrics["ppo_kl"]), ppo_kl_truncated))
         self.assertTrue(torch.allclose(torch.tensor(metrics["pg_loss"]), ppo_loss_truncated))
         # Check that IS truncation metric is present and has expected value
-        self.assertIn("is_truncate_frac", metrics)
+        self.assertIn("is_truncate_frac_pos", metrics)
+        self.assertIn("is_truncate_frac_neg", metrics)
+        self.assertTrue(
+            torch.allclose(torch.tensor(metrics["is_truncate_frac_pos"]), is_truncate_frac_pos_expected)
+        )
         self.assertTrue(
-            torch.allclose(torch.tensor(metrics["is_truncate_frac"]), is_truncate_frac_expected)
+            torch.allclose(torch.tensor(metrics["is_truncate_frac_neg"]), is_truncate_frac_neg_expected)
         )
-        self.assertGreaterEqual(metrics["is_truncate_frac"], 0.0)
-        self.assertLessEqual(metrics["is_truncate_frac"], 1.0)
+        self.assertGreaterEqual(metrics["is_truncate_frac_pos"], 0.0)
+        self.assertLessEqual(metrics["is_truncate_frac_pos"], 1.0)
+        self.assertGreaterEqual(metrics["is_truncate_frac_neg"], 0.0)
+        self.assertLessEqual(metrics["is_truncate_frac_neg"], 1.0)
diff --git a/trinity/algorithm/policy_loss_fn/__init__.py b/trinity/algorithm/policy_loss_fn/__init__.py
@@ -14,7 +14,6 @@
 from trinity.algorithm.policy_loss_fn.sft_loss import SFTLossFn
 from trinity.algorithm.policy_loss_fn.sppo_loss_fn import sPPOPolicyLossFn
 from trinity.algorithm.policy_loss_fn.topr_policy_loss import TOPRPolicyLossFn
-from trinity.algorithm.policy_loss_fn.dcppo_policy_loss import DualClipPPOPolicyLossFn
 
 __all__ = [
     "POLICY_LOSS_FN",
@@ -32,5 +31,4 @@
     "SFTPhiLossFn",
     "sPPOPolicyLossFn",
     "RECPolicyLossFn",
-    "DualClipPPOPolicyLossFn",
 ]
diff --git a/trinity/algorithm/policy_loss_fn/dcppo_policy_loss.py b/trinity/algorithm/policy_loss_fn/dcppo_policy_loss.py
diff --git a/trinity/algorithm/policy_loss_fn/ppo_policy_loss.py b/trinity/algorithm/policy_loss_fn/ppo_policy_loss.py
@@ -20,7 +20,8 @@ def __init__(
         clip_range_low: Optional[float] = None,
         clip_range_high: Optional[float] = None,
         loss_agg_mode: Optional[str] = "token-mean",
-        truncate_large_is: bool = False,
+        truncate_adv_pos_is: bool = False,
+        truncate_adv_neg_is: bool = False,
         truncate_is_range_low: Optional[float] = 0.0,
         truncate_is_range_high: Optional[float] = 2.0,
     ) -> None:
@@ -33,8 +34,12 @@ def __init__(
             clip_range_low: Lower bound for clipping (1.0 - clip_range_low)
             clip_range_high: Upper bound for clipping (1.0 + clip_range_high)
             loss_agg_mode: Loss aggregation mode (default: "token-mean")
-            truncate_large_is: Whether to truncate large importance sampling ratios
-                to handle calculation discrepancies between rollout and training engines
+            truncate_adv_pos_is: Whether to floor the importance sampling ratio at
+                truncate_is_range_low when advantage is positive to handle calculation
+                discrepancies between rollout and training engines
+            truncate_adv_neg_is: Whether to truncate large importance sampling ratios
+                when advantage is negative to handle calculation discrepancies between
+                rollout and training engines
             truncate_is_range_low: Lower bound for IS ratio truncation (default: 0.0)
             truncate_is_range_high: Upper bound for IS ratio truncation (default: 2.0)
         """
@@ -52,17 +57,27 @@ def __init__(
         self.loss_agg_mode = loss_agg_mode
 
         # Truncate large IS configuration
-        self.truncate_large_is = truncate_large_is
-        if truncate_large_is:
+        self.truncate_adv_pos_is = truncate_adv_pos_is
+        self.truncate_adv_neg_is = truncate_adv_neg_is
+        if truncate_adv_pos_is:
             self.truncate_is_range_low = truncate_is_range_low
-            self.truncate_is_range_high = truncate_is_range_high
             assert (
                 self.truncate_is_range_low is not None
             ), "truncate_is_range_low must be specified."
+            assert (
+                self.truncate_is_range_low >= 0.0
+            ), "truncate_is_range_low must be non-negative."
+            assert (self.truncate_is_range_low < 1.0-self.clip_range_low
+            ), "truncate_is_range_low must be less than 1.0 - clip_range_low."
+        if truncate_adv_neg_is:
+            self.truncate_is_range_high = truncate_is_range_high
             assert (
                 self.truncate_is_range_high is not None
             ), "truncate_is_range_high must be specified."
-            assert self.truncate_is_range_low >= 0.0, "truncate_is_range_low must be non-negative."
+            assert (
+                self.truncate_is_range_high > 1.0+self.clip_range_high
+            ), "truncate_is_range_high must be greater than clip_range_high + 1.0."
+        if truncate_adv_pos_is and truncate_adv_neg_is:
             assert (
                 self.truncate_is_range_high > self.truncate_is_range_low
             ), "truncate_is_range_high must be greater than truncate_is_range_low."
@@ -79,36 +94,54 @@ def __call__(  # type: ignore
         ratio = torch.exp(negative_approx_kl)
         ppo_kl = masked_mean(-negative_approx_kl, action_mask)
 
-        # Truncate large IS ratios if enabled
-        # This helps stabilize training when there are calculation discrepancies between
-        # rollout and training engines, especially for small probabilities
-        if self.truncate_large_is:
-            # Track how often truncation occurs (before actually truncating)
-            # More efficient than cloning: directly check which values fall outside bounds
-            ratio_detached = ratio.detach()
-            is_truncate_frac = masked_mean(
-                (ratio_detached < self.truncate_is_range_low).float(), action_mask
-            ) + masked_mean((ratio_detached > self.truncate_is_range_high).float(), action_mask)
-            ratio = torch.clamp(ratio, self.truncate_is_range_low, self.truncate_is_range_high)
-
-        pg_losses = -advantages * ratio
+        # First clip the ratio by clip_range and compute pg_clipfrac
+        pg_losses1 = -advantages * ratio
         pg_losses2 = -advantages * torch.clamp(
             ratio, 1.0 - self.clip_range_low, 1.0 + self.clip_range_high  # type: ignore
         )
+        pg_losses_clip = torch.maximum(pg_losses1, pg_losses2)
+        pg_clipfrac = masked_mean(torch.gt(pg_losses2, pg_losses1).float(), action_mask)
+
+        # After clipping by clip_range, further truncate IS ratios if enabled
+        # This helps stabilize training when there are calculation discrepancies between
+        # rollout and training engines, especially for small probabilities
+        pg_truncfrac_pos, pg_truncfrac_neg = 0.0, 0.0
+        pg_losses_trunc = pg_losses_clip
+        
+        # Add IS truncation for positive advantages
+        if self.truncate_adv_pos_is:
+            pg_losses_pos_trunc = -advantages * self.truncate_is_range_low
+            pg_truncfrac_pos = masked_mean(
+                torch.lt(pg_losses_pos_trunc, pg_losses_trunc) * (advantages > 0).float(),
+                action_mask,
+            )
+            pg_losses_pos = torch.minimum(pg_losses_trunc, pg_losses_pos_trunc)
+            pg_losses_trunc = torch.where(advantages > 0, pg_losses_pos, pg_losses_trunc)
+
+        # Add IS truncation for negative advantages
+        if self.truncate_adv_neg_is:
+            pg_losses_neg_trunc = -advantages * self.truncate_is_range_high
+            pg_truncfrac_neg = masked_mean(
+                torch.lt(pg_losses_neg_trunc, pg_losses_trunc) * (advantages < 0).float(),
+                action_mask,
+            )
+            pg_losses_neg = torch.minimum(pg_losses_trunc, pg_losses_neg_trunc)
+            pg_losses_trunc = torch.where(advantages < 0, pg_losses_neg, pg_losses_trunc)
 
         pg_loss = masked_loss(
-            torch.max(pg_losses, pg_losses2), action_mask, loss_agg_mode=self.loss_agg_mode
+            pg_losses_trunc, action_mask, loss_agg_mode=self.loss_agg_mode
         )
-        pg_clipfrac = masked_mean(torch.gt(pg_losses2, pg_losses).float(), action_mask)
         metrics = {
             "pg_clipfrac": pg_clipfrac.detach().item(),
             "ppo_kl": ppo_kl.detach().item(),
             "pg_loss": pg_loss.detach().item(),
         }
 
         # Add IS truncation metrics if enabled
-        if self.truncate_large_is:
-            metrics["is_truncate_frac"] = is_truncate_frac.detach().item()
+        if self.truncate_adv_pos_is:
+            metrics["is_truncate_frac_pos"] = pg_truncfrac_pos.detach().item()
+        if self.truncate_adv_neg_is:
+            metrics["is_truncate_frac_neg"] = pg_truncfrac_neg.detach().item()
 
         return pg_loss, metrics
 
@@ -117,7 +150,8 @@ def default_args(cls) -> Dict:
         return {
             "clip_range": 0.2,
             "loss_agg_mode": "token-mean",
-            "truncate_large_is": False,
+            "truncate_adv_pos_is": False,
+            "truncate_adv_neg_is": False,
             "truncate_is_range_low": 0.0,
             "truncate_is_range_high": 2.0,
         }