@@ -46,7 +46,6 @@ class RSPPO(OnPolicyAlgorithm):
         no clipping will be done on the value function.
         IMPORTANT: this clipping depends on the reward scaling.
     :param normalize_advantage: Whether to normalize or not the advantage
-    :param ent_coef: Entropy coefficient for the loss calculation
     :param vf_coef: Value function coefficient for the loss calculation
     :param max_grad_norm: The maximum value for the gradient clipping
     :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
@@ -90,7 +89,6 @@ def __init__(
         clip_range: Union[float, Schedule] = 0.2,
         clip_range_vf: Union[None, float, Schedule] = None,
         normalize_advantage: bool = True,
-        ent_coef: float = 0.0,
         vf_coef: float = 0.5,
         max_grad_norm: float = 0.5,
         use_sde: bool = False,
@@ -113,7 +111,6 @@ def __init__(
             n_steps=n_steps,
             gamma=gamma,
             gae_lambda=gae_lambda,
-            ent_coef=ent_coef,
             vf_coef=vf_coef,
             max_grad_norm=max_grad_norm,
             use_sde=use_sde,
@@ -126,6 +123,7 @@ def __init__(
             verbose=verbose,
             device=device,
             seed=seed,
+            ent_coef=0,
             _init_setup_model=False,
             supported_action_spaces=(
                 spaces.Box,
@@ -195,7 +193,6 @@ def train(self) -> None:
         if self.clip_range_vf is not None:
             clip_range_vf = self.clip_range_vf(self._current_progress_remaining)  # type: ignore[operator]

-        entropy_losses = []
         pg_losses, value_losses = [], []
         clip_fractions = []

@@ -210,7 +207,7 @@ def train(self) -> None:
                     # Convert discrete action from float to long
                     actions = rollout_data.actions.long().flatten()

-                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
+                values, log_prob, _ = self.policy.evaluate_actions(rollout_data.observations, actions)
                 values = values.flatten()
                 # Normalize advantage
                 advantages = rollout_data.advantages
@@ -246,16 +243,7 @@ def train(self) -> None:
                 value_loss = F.mse_loss(rollout_data.returns, values_pred)
                 value_losses.append(value_loss.item())

-                # Entropy loss favor exploration
-                if entropy is None:
-                    # Approximate entropy when no analytical form
-                    entropy_loss = -th.mean(-log_prob)
-                else:
-                    entropy_loss = -th.mean(entropy)
-
-                entropy_losses.append(entropy_loss.item())
-
-                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
+                loss = policy_loss + self.vf_coef * value_loss

                 # Calculate approximate form of reverse KL Divergence for early stopping
                 # see issue #417: https://github.com/DLR-RM/stable-baselines3/issues/417
@@ -286,7 +274,6 @@ def train(self) -> None:
         explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())

         # Logs
-        self.logger.record("train/entropy_loss", np.mean(entropy_losses))
         self.logger.record("train/policy_gradient_loss", np.mean(pg_losses))
         self.logger.record("train/value_loss", np.mean(value_losses))
         self.logger.record("train/approx_kl", np.mean(approx_kl_divs))
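After this change, RSPPO no longer exposes `ent_coef` in its constructor, passes `ent_coef=0` to the parent `OnPolicyAlgorithm`, and optimizes `policy_loss + vf_coef * value_loss` with no entropy bonus. Below is a minimal usage sketch under the assumption that the rest of the constructor mirrors the standard SB3 PPO signature; the import path `rsppo` and the `CartPole-v1` environment are illustrative placeholders, not taken from this commit.

```python
# Minimal sketch, assuming RSPPO keeps the standard SB3-style constructor
# apart from the removed ent_coef argument.
import gymnasium as gym

from rsppo import RSPPO  # hypothetical module path

env = gym.make("CartPole-v1")

# Note: no ent_coef here; with this change the training loss is
# policy_loss + vf_coef * value_loss, and "train/entropy_loss" is no longer logged.
model = RSPPO(
    "MlpPolicy",
    env,
    clip_range=0.2,
    vf_coef=0.5,
    verbose=1,
)
model.learn(total_timesteps=10_000)
```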