@@ -46,7 +46,6 @@ class RSPPO(OnPolicyAlgorithm):
         no clipping will be done on the value function.
         IMPORTANT: this clipping depends on the reward scaling.
     :param normalize_advantage: Whether to normalize or not the advantage
-    :param ent_coef: Entropy coefficient for the loss calculation
     :param vf_coef: Value function coefficient for the loss calculation
     :param max_grad_norm: The maximum value for the gradient clipping
     :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
@@ -90,7 +89,6 @@ def __init__(
         clip_range: Union[float, Schedule] = 0.2,
         clip_range_vf: Union[None, float, Schedule] = None,
         normalize_advantage: bool = True,
-        ent_coef: float = 0.0,
         vf_coef: float = 0.5,
         max_grad_norm: float = 0.5,
         use_sde: bool = False,
@@ -113,7 +111,6 @@ def __init__(
             n_steps=n_steps,
             gamma=gamma,
             gae_lambda=gae_lambda,
-            ent_coef=ent_coef,
             vf_coef=vf_coef,
             max_grad_norm=max_grad_norm,
             use_sde=use_sde,
@@ -126,6 +123,7 @@ def __init__(
             verbose=verbose,
             device=device,
             seed=seed,
+            ent_coef=0,
             _init_setup_model=False,
             supported_action_spaces=(
                 spaces.Box,
@@ -195,7 +193,6 @@ def train(self) -> None:
         if self.clip_range_vf is not None:
             clip_range_vf = self.clip_range_vf(self._current_progress_remaining)  # type: ignore[operator]

-        entropy_losses = []
         pg_losses, value_losses = [], []
         clip_fractions = []

@@ -210,7 +207,7 @@ def train(self) -> None:
                     # Convert discrete action from float to long
                     actions = rollout_data.actions.long().flatten()

-                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
+                values, log_prob, _ = self.policy.evaluate_actions(rollout_data.observations, actions)
                 values = values.flatten()
                 # Normalize advantage
                 advantages = rollout_data.advantages
@@ -246,16 +243,7 @@ def train(self) -> None:
                 value_loss = F.mse_loss(rollout_data.returns, values_pred)
                 value_losses.append(value_loss.item())

-                # Entropy loss favor exploration
-                if entropy is None:
-                    # Approximate entropy when no analytical form
-                    entropy_loss = -th.mean(-log_prob)
-                else:
-                    entropy_loss = -th.mean(entropy)
-
-                entropy_losses.append(entropy_loss.item())
-
-                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
+                loss = policy_loss + self.vf_coef * value_loss

                 # Calculate approximate form of reverse KL Divergence for early stopping
                 # see issue #417: https://github.com/DLR-RM/stable-baselines3/issues/417
@@ -286,7 +274,6 @@ def train(self) -> None:
         explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())

         # Logs
-        self.logger.record("train/entropy_loss", np.mean(entropy_losses))
         self.logger.record("train/policy_gradient_loss", np.mean(pg_losses))
         self.logger.record("train/value_loss", np.mean(value_losses))
         self.logger.record("train/approx_kl", np.mean(approx_kl_divs))