
Commit b9346fd

Fix gamma/lam issue

1 parent 0d5bad2 · commit b9346fd

2 files changed: +8, -13 lines


trinity/common/verl_config.py

Lines changed: 5 additions & 10 deletions
@@ -182,10 +182,8 @@ class KL_Ctrl:
 
 @dataclass
 class Algorithm:
-    gamma: float = 1.0
-    lam: float = 1.0
     adv_estimator: str = "gae"
-    # TODO (yanxi): remove the above advantage-related parameters?
+    # TODO (yanxi): might remove adv_estimator completely, use AlgorithmConfig.advantage_fn_type instead
     norm_adv_by_std_in_grpo: bool = True
     use_kl_in_reward: bool = False
     kl_penalty: str = "kl"
@@ -316,20 +314,17 @@ def synchronize_config(self, config: Config) -> None: # noqa: C901
         self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
 
         # Algorithm related config
-        if config.algorithm.gamma is not None:
-            self.algorithm.gamma = config.algorithm.gamma
-        if config.algorithm.lam is not None:
-            self.algorithm.lam = config.algorithm.lam
         self.actor_rollout_ref.actor.algorithm_type = config.algorithm.algorithm_type
         if config.algorithm.algorithm_type == AlgorithmType.PPO:
             logger.info("Setting `adv_estimator` to 'gae' for PPO")
             self.algorithm.adv_estimator = AdvantageEstimator.GAE.value
         elif config.algorithm.algorithm_type in (AlgorithmType.GRPO, AlgorithmType.OPMD):
             logger.info("Setting `adv_estimator` to 'grpo' for GRPO/OPMD")
             self.algorithm.adv_estimator = AdvantageEstimator.GRPO.value
-        # TODO (yanxi): it seems that adv_estimator only affects whether use_critic is set to
-        # True or False in RayPPOTrainer.__init__() (and hence in VerlPPOTrainerWrapper);
-        # need to double check whether this is indeed the case.
+        # TODO (yanxi): it seems that adv_estimator now only affects whether use_critic is set to
+        # True or False in RayPPOTrainer.__init__() (and hence in VerlPPOTrainerWrapper).
+        # Need to double check whether this is indeed the case,
+        # and see if adv_estimator can be removed completely.
 
         if self.actor_rollout_ref.actor.algorithm_type.is_dpo():  # for DPO
             if not self.actor_rollout_ref.actor.use_kl_loss:
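
For reference, the synchronization that remains after this commit is essentially a mapping from algorithm_type to verl's adv_estimator string. The sketch below is a condensed illustration of that mapping, not the project's actual code: the lookup table, the resolve_adv_estimator helper, and the stand-in enums are hypothetical, mirroring only what the diff above shows.

    from enum import Enum

    # Stand-in enums for illustration only; the real AlgorithmType and
    # AdvantageEstimator live elsewhere in the trinity / verl code bases.
    class AlgorithmType(Enum):
        PPO = "ppo"
        GRPO = "grpo"
        OPMD = "opmd"
        DPO = "dpo"

    class AdvantageEstimator(Enum):
        GAE = "gae"
        GRPO = "grpo"

    # Hypothetical condensation of the adv_estimator selection performed in
    # synchronize_config() above.
    _ADV_ESTIMATOR_BY_ALGO = {
        AlgorithmType.PPO: AdvantageEstimator.GAE.value,    # "gae"; per the TODO, this drives use_critic
        AlgorithmType.GRPO: AdvantageEstimator.GRPO.value,  # "grpo", critic-free
        AlgorithmType.OPMD: AdvantageEstimator.GRPO.value,  # OPMD reuses the GRPO estimator
    }

    def resolve_adv_estimator(algorithm_type: AlgorithmType) -> str:
        # Any other algorithm type falls back to the Algorithm dataclass default, "gae".
        return _ADV_ESTIMATOR_BY_ALGO.get(algorithm_type, "gae")

For example, resolve_adv_estimator(AlgorithmType.OPMD) returns "grpo", matching the logger message in the elif branch above.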

trinity/trainer/verl/core_algos.py

Lines changed: 3 additions & 3 deletions
@@ -139,8 +139,8 @@ def compute_gae_advantage_return(
     token_level_rewards: torch.Tensor,
     values: torch.Tensor,
     eos_mask: torch.Tensor,
-    gamma: torch.Tensor,
-    lam: torch.Tensor,
+    gamma: float,
+    lam: float,
 ):
     """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py
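
With gamma and lam now typed as plain Python floats, nothing tensor-valued is required of them in the GAE recursion. Below is a minimal, self-contained sketch of that recursion for reference; it is not the repository's compute_gae_advantage_return (which also applies eos_mask and whitens the advantages), and the name gae_sketch is made up for illustration.

    import torch

    def gae_sketch(rewards: torch.Tensor, values: torch.Tensor,
                   gamma: float = 1.0, lam: float = 1.0):
        # rewards, values: shape (batch, seq_len); gamma and lam are scalar floats.
        advantages = torch.zeros_like(rewards)
        last_adv = torch.zeros(rewards.shape[0], dtype=rewards.dtype)
        next_value = torch.zeros(rewards.shape[0], dtype=rewards.dtype)
        for t in reversed(range(rewards.shape[1])):
            delta = rewards[:, t] + gamma * next_value - values[:, t]  # TD residual
            last_adv = delta + gamma * lam * last_adv                  # GAE recursion
            advantages[:, t] = last_adv
            next_value = values[:, t]
        returns = advantages + values
        return advantages, returns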
@@ -283,7 +283,7 @@ def compute_rloo_outcome_advantage(
 
 
 def compute_reinforce_plus_plus_outcome_advantage(
-    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, gamma: torch.Tensor
+    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, gamma: float
 ):
     """
     Compute advantage for REINFORCE++.
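
The same point holds for REINFORCE++: gamma only enters as a scalar discount when accumulating token-level returns. A minimal sketch of that accumulation, ignoring the eos_mask handling and whitening the real function performs (the function name is hypothetical):

    import torch

    def discounted_returns_sketch(token_level_rewards: torch.Tensor, gamma: float) -> torch.Tensor:
        # token_level_rewards: shape (batch, seq_len); gamma is a scalar float.
        returns = torch.zeros_like(token_level_rewards)
        running = torch.zeros(token_level_rewards.shape[0], dtype=token_level_rewards.dtype)
        for t in reversed(range(token_level_rewards.shape[1])):
            running = token_level_rewards[:, t] + gamma * running
            returns[:, t] = running
        return returns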
