Updates from deployable-rl

yardenas · yardenas · commit 3be540e7db3b · 2024-06-27T12:59:54.000+02:00
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ dm-env = "^1.6"
 distrax = "^0.1.5"
 pillow = "^10.2.0"
 moviepy = "^1.0.3"
-safe-adaptation-gym = {git = "https://git@github.com/lasgroup/safe-adaptation-gym.git"}
+safe-adaptation-gym = {git = "ssh://git@github.com/lasgroup/safe-adaptation-gym"}
 jmp = {git = "https://github.com/deepmind/jmp"}
 tensorboard = "^2.16.2"
 
diff --git a/safe_opax/configs/config.yaml b/safe_opax/configs/config.yaml
@@ -42,7 +42,7 @@ training:
   episodes_per_epoch: 5
   epochs: 200
   action_repeat: 1
-  render_episodes: 1
+  render_episodes: 0
   parallel_envs: 10
   scale_reward: 1.
   exploration_steps: 5000
diff --git a/safe_opax/configs/experiment/safety_gym_doggo.yaml b/safe_opax/configs/experiment/safety_gym_doggo.yaml
@@ -6,7 +6,7 @@ training:
   epochs: 200
   safe: true
   action_repeat: 2
-  episodes_per_epoch: 10
+  episodes_per_epoch: 5
 
 environment:
   safe_adaptation_gym:
diff --git a/safe_opax/la_mbda/actor_critic.py b/safe_opax/la_mbda/actor_critic.py
@@ -9,18 +9,6 @@
 from safe_opax.rl.utils import rl_initialize_weights_trick
 
 
-class StableTanh(trx.Tanh):
-    def inverse_and_log_det(self, y):
-        dtype = y.dtype
-        y = y.astype(jnp.float32)
-        # Clip to avoid computing very large gradients outside of
-        # the given range.
-        y = jnp.clip(y, -0.99999997, 0.99999997)
-        x = jnp.arctanh(y)
-        x = x.astype(dtype)
-        return x, -self.forward_log_det_jacobian(x)
-
-
 class ContinuousActor(eqx.Module):
     net: eqx.nn.MLP
     init_stddev: float = eqx.static_field()
@@ -55,9 +43,8 @@ def __call__(self, state: jax.Array) -> trx.Transformed:
         init_std = inv_softplus(self.init_stddev)
         stddev = jnn.softplus(stddev + init_std) + 1e-4
         mu = 5.0 * jnn.tanh(mu / 5.0)
-        dist = trx.MultivariateNormalDiag(mu, stddev)
-        bijector = trx.Block(StableTanh(), 1)
-        dist = trx.Transformed(dist, bijector)
+        dist = trx.Normal(mu, stddev)
+        dist = trx.Transformed(dist, trx.Tanh())
         return dist
 
     def act(
diff --git a/safe_opax/la_mbda/safe_actor_critic.py b/safe_opax/la_mbda/safe_actor_critic.py
@@ -196,8 +196,7 @@ def evaluate_actor(
     objective_sentiment: Sentiment,
     constraint_sentiment: Sentiment,
 ) -> ActorEvaluation:
-    keys = jnp.asarray(jax.random.split(key, initial_states.shape[0]))
-    trajectories, priors = rollout_fn(horizon, initial_states, keys, actor.act)
+    trajectories, priors = rollout_fn(horizon, initial_states, key, actor.act)
     next_step = lambda x: x[:, 1:]
     current_step = lambda x: x[:, :-1]
     next_states = next_step(trajectories.next_state)