From c05c67a6556712b1218b92ee6b9e6eae52c0bf13 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 31 Mar 2025 17:41:15 +0200 Subject: [PATCH 1/2] Fix recurrent buffer not using env change for resetting state --- sb3_contrib/common/recurrent/buffers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sb3_contrib/common/recurrent/buffers.py b/sb3_contrib/common/recurrent/buffers.py index 5386db11..8cdc525a 100644 --- a/sb3_contrib/common/recurrent/buffers.py +++ b/sb3_contrib/common/recurrent/buffers.py @@ -228,6 +228,11 @@ def _get_samples( lstm_states_pi = (self.to_torch(lstm_states_pi[0]).contiguous(), self.to_torch(lstm_states_pi[1]).contiguous()) lstm_states_vf = (self.to_torch(lstm_states_vf[0]).contiguous(), self.to_torch(lstm_states_vf[1]).contiguous()) + # See issue GH#284 + episode_starts = np.logical_or(self.episode_starts[batch_inds], env_change[batch_inds]).astype( + self.episode_starts.dtype + ) + return RecurrentRolloutBufferSamples( # (batch_size, obs_dim) -> (n_seq, max_length, obs_dim) -> (n_seq * max_length, obs_dim) observations=self.pad(self.observations[batch_inds]).reshape((padded_batch_size, *self.obs_shape)), @@ -237,7 +242,7 @@ def _get_samples( advantages=self.pad_and_flatten(self.advantages[batch_inds]), returns=self.pad_and_flatten(self.returns[batch_inds]), lstm_states=RNNStates(lstm_states_pi, lstm_states_vf), - episode_starts=self.pad_and_flatten(self.episode_starts[batch_inds]), + episode_starts=self.pad_and_flatten(episode_starts), mask=self.pad_and_flatten(np.ones_like(self.returns[batch_inds])), ) @@ -372,6 +377,10 @@ def _get_samples( observations = {key: self.pad(obs[batch_inds]) for (key, obs) in self.observations.items()} observations = {key: obs.reshape((padded_batch_size,) + self.obs_shape[key]) for (key, obs) in observations.items()} + episode_starts = np.logical_or(self.episode_starts[batch_inds], env_change[batch_inds]).astype( + self.episode_starts.dtype + ) + return RecurrentDictRolloutBufferSamples( observations=observations, actions=self.pad(self.actions[batch_inds]).reshape((padded_batch_size,) + self.actions.shape[1:]), @@ -380,6 +389,6 @@ def _get_samples( advantages=self.pad_and_flatten(self.advantages[batch_inds]), returns=self.pad_and_flatten(self.returns[batch_inds]), lstm_states=RNNStates(lstm_states_pi, lstm_states_vf), - episode_starts=self.pad_and_flatten(self.episode_starts[batch_inds]), + episode_starts=self.pad_and_flatten(episode_starts), mask=self.pad_and_flatten(np.ones_like(self.returns[batch_inds])), ) From 3ad432b7bda2ea4825fbf8bf9a76f6342258e6d8 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 31 Mar 2025 17:42:03 +0200 Subject: [PATCH 2/2] Fixes for newer pytorch version --- pyproject.toml | 1 + sb3_contrib/common/torch_layers.py | 1 - sb3_contrib/crossq/crossq.py | 2 +- sb3_contrib/tqc/tqc.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f19c2c0..3745357d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ exclude = """(?x)( | sb3_contrib/ars/ars.py$ | sb3_contrib/common/recurrent/policies.py$ | sb3_contrib/common/recurrent/buffers.py$ + | sb3_contrib/common/torch_layers.py$ | tests/test_train_eval_mode.py$ )""" diff --git a/sb3_contrib/common/torch_layers.py b/sb3_contrib/common/torch_layers.py index 2605441e..c02770fb 100644 --- a/sb3_contrib/common/torch_layers.py +++ b/sb3_contrib/common/torch_layers.py @@ -47,7 +47,6 @@ def __init__( self.affine = affine self.eps = eps - self.step = 0 self.momentum = momentum self.num_features = num_features # Clip scale and bias of the affine transform diff --git a/sb3_contrib/crossq/crossq.py b/sb3_contrib/crossq/crossq.py index 1b7f90b8..e2d36ec8 100644 --- a/sb3_contrib/crossq/crossq.py +++ b/sb3_contrib/crossq/crossq.py @@ -274,7 +274,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: # Optimize entropy coefficient, also called entropy temperature or alpha in the paper if self.ent_coef_optimizer is not None: - ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean() + ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean() # type: ignore[operator] ent_coef_losses.append(ent_coef_loss.item()) self.ent_coef_optimizer.zero_grad() diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py index 99b914c6..9c08ea69 100644 --- a/sb3_contrib/tqc/tqc.py +++ b/sb3_contrib/tqc/tqc.py @@ -222,7 +222,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: # so we don't change it with other losses # see https://github.com/rail-berkeley/softlearning/issues/60 ent_coef = th.exp(self.log_ent_coef.detach()) - ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean() + ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean() # type: ignore[operator] ent_coef_losses.append(ent_coef_loss.item()) else: ent_coef = self.ent_coef_tensor