@@ -46,7 +46,6 @@ class RSPPO(OnPolicyAlgorithm):
         no clipping will be done on the value function.
         IMPORTANT: this clipping depends on the reward scaling.
     :param normalize_advantage: Whether to normalize or not the advantage
-    :param ent_coef: Entropy coefficient for the loss calculation
     :param vf_coef: Value function coefficient for the loss calculation
     :param max_grad_norm: The maximum value for the gradient clipping
     :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
@@ -90,7 +89,6 @@ def __init__(
         clip_range: Union[float, Schedule] = 0.2,
         clip_range_vf: Union[None, float, Schedule] = None,
         normalize_advantage: bool = True,
-        ent_coef: float = 0.0,
         vf_coef: float = 0.5,
         max_grad_norm: float = 0.5,
         use_sde: bool = False,
@@ -113,7 +111,6 @@ def __init__(
             n_steps=n_steps,
             gamma=gamma,
             gae_lambda=gae_lambda,
-            ent_coef=ent_coef,
             vf_coef=vf_coef,
             max_grad_norm=max_grad_norm,
             use_sde=use_sde,
@@ -195,7 +192,6 @@ def train(self) -> None:
         if self.clip_range_vf is not None:
             clip_range_vf = self.clip_range_vf(self._current_progress_remaining)  # type: ignore[operator]
 
-        entropy_losses = []
         pg_losses, value_losses = [], []
         clip_fractions = []
 
@@ -210,7 +206,7 @@ def train(self) -> None:
                     # Convert discrete action from float to long
                     actions = rollout_data.actions.long().flatten()
 
-                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
+                values, log_prob, _ = self.policy.evaluate_actions(rollout_data.observations, actions)
                 values = values.flatten()
                 # Normalize advantage
                 advantages = rollout_data.advantages
@@ -246,16 +242,7 @@ def train(self) -> None:
                     value_loss = F.mse_loss(rollout_data.returns, values_pred)
                 value_losses.append(value_loss.item())
 
-                # Entropy loss favor exploration
-                if entropy is None:
-                    # Approximate entropy when no analytical form
-                    entropy_loss = -th.mean(-log_prob)
-                else:
-                    entropy_loss = -th.mean(entropy)
-
-                entropy_losses.append(entropy_loss.item())
-
-                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
+                loss = policy_loss + self.vf_coef * value_loss
 
                 # Calculate approximate form of reverse KL Divergence for early stopping
                 # see issue #417: https://github.com/DLR-RM/stable-baselines3/issues/417
@@ -286,7 +273,6 @@ def train(self) -> None:
         explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())
 
         # Logs
-        self.logger.record("train/entropy_loss", np.mean(entropy_losses))
         self.logger.record("train/policy_gradient_loss", np.mean(pg_losses))
         self.logger.record("train/value_loss", np.mean(value_losses))
         self.logger.record("train/approx_kl", np.mean(approx_kl_divs))