@@ -31,7 +31,6 @@ def __init__(
         reward_normalization: bool = False,
         device: str = "cpu",
         weight_schedule: dict | None = None,
-        **kwargs,
     ):
         """Initialize the RND module.

@@ -58,13 +57,19 @@ def __init__(
             It is a dictionary with the following keys:

             - "mode": The type of schedule to use for the RND weight parameter.
-            - "max_num_steps": Maximum number of steps per episode. Used for the weight schedule of type "step".
-            - "final_value": Final value of the weight parameter. Used for the weight schedule of type "step".
+                - "constant": Constant weight schedule.
+                - "step": Step weight schedule.
+                - "linear": Linear weight schedule.

-        Keyword Args:
+            For the "step" weight schedule, the following parameters are required:

-            max_num_steps (int): Maximum number of steps per episode. Used for the weight schedule of type "step".
-            final_value (float): Final value of the weight parameter. Used for the weight schedule of type "step".
+            - "final_step": The step at which the weight parameter is set to the final value.
+            - "final_value": The final value of the weight parameter.
+
+            For the "linear" weight schedule, the following parameters are required:
+            - "initial_step": The step at which the weight parameter is set to the initial value.
+            - "final_step": The step at which the weight parameter is set to the final value.
+            - "final_value": The final value of the weight parameter.
         """
         # initialize parent class
         super().__init__()
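For reference, a minimal sketch of what the `weight_schedule` dictionary described in the hunk above could look like for each mode. The key names ("mode", "final_step", "final_value", "initial_step") come from the docstring in this commit; the concrete values are illustrative placeholders, not taken from the repository.

```python
# Illustrative weight_schedule configurations (values are placeholders, not defaults).
constant_schedule = {"mode": "constant"}

step_schedule = {
    "mode": "step",
    "final_step": 1_000,   # step at which the weight switches to final_value
    "final_value": 0.0,    # weight used from final_step onwards
}

linear_schedule = {
    "mode": "linear",
    "initial_step": 500,   # the initial weight is kept until this step
    "final_step": 2_000,   # the final value is reached at this step
    "final_value": 0.0,    # weight reached at (and held after) final_step
}
```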
@@ -79,7 +84,7 @@ def __init__(

         # Normalization of input gates
         if state_normalization:
-            self.state_normalizer = EmpiricalNormalization(shape=[self.num_obs], until=1.0e8).to(self.device)
+            self.state_normalizer = EmpiricalNormalization(shape=[self.num_states], until=1.0e8).to(self.device)
         else:
             self.state_normalizer = torch.nn.Identity()
         # Normalization of intrinsic reward
@@ -101,14 +106,14 @@ def __init__(
         self.predictor = self._build_mlp(num_states, predictor_hidden_dims, num_outputs, activation).to(self.device)
         self.target = self._build_mlp(num_states, target_hidden_dims, num_outputs, activation).to(self.device)

-    def get_intrinsic_reward(self, gated_state) -> tuple[torch.Tensor, torch.Tensor]:
+    def get_intrinsic_reward(self, rnd_state) -> tuple[torch.Tensor, torch.Tensor]:
         # note: the counter is updated number of env steps per learning iteration
         self.update_counter += 1
-        # Normalize gated state
-        gated_state = self.state_normalizer(gated_state)
-        # Obtain the embedding of the gated state from the target and predictor networks
-        target_embedding = self.target(gated_state).detach()
-        predictor_embedding = self.predictor(gated_state).detach()
+        # Normalize rnd state
+        rnd_state = self.state_normalizer(rnd_state)
+        # Obtain the embedding of the rnd state from the target and predictor networks
+        target_embedding = self.target(rnd_state).detach()
+        predictor_embedding = self.predictor(rnd_state).detach()
         # Compute the intrinsic reward as the distance between the embeddings
         intrinsic_reward = torch.linalg.norm(target_embedding - predictor_embedding, dim=1)
         # Normalize intrinsic reward
@@ -122,7 +127,7 @@ def get_intrinsic_reward(self, gated_state) -> tuple[torch.Tensor, torch.Tensor]
         # Scale intrinsic reward
         intrinsic_reward *= self.weight

-        return intrinsic_reward, gated_state
+        return intrinsic_reward, rnd_state

     def forward(self, *args, **kwargs):
         raise RuntimeError("Forward method is not implemented. Use get_intrinsic_reward instead.")
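To make the renamed interface concrete, here is a self-contained sketch of the computation `get_intrinsic_reward` performs on the `rnd_state` input: the intrinsic reward is the per-sample distance between the embeddings of a fixed target network and a trained predictor network. The network sizes and batch shape below are illustrative assumptions, not values from this commit.

```python
import torch

num_envs, num_states, num_outputs = 4, 16, 8  # illustrative sizes

# Fixed (target) and trainable (predictor) embedding networks.
target = torch.nn.Sequential(torch.nn.Linear(num_states, 64), torch.nn.ELU(), torch.nn.Linear(64, num_outputs))
predictor = torch.nn.Sequential(torch.nn.Linear(num_states, 64), torch.nn.ELU(), torch.nn.Linear(64, num_outputs))

rnd_state = torch.randn(num_envs, num_states)  # one RND state per environment
with torch.no_grad():
    # Distance between the two embeddings, one reward per environment.
    intrinsic_reward = torch.linalg.norm(target(rnd_state) - predictor(rnd_state), dim=1)
print(intrinsic_reward.shape)  # torch.Size([4])
```

In the module itself, this value is additionally normalized (if reward normalization is enabled) and scaled by the scheduled weight before being returned together with the normalized `rnd_state`.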
@@ -171,8 +176,16 @@ def _build_mlp(input_dims: int, hidden_dims: list[int], output_dims: int, activa
     Different weight schedules.
     """

-    def _constant_weight_schedule(self, step, **kwargs):
+    def _constant_weight_schedule(self, step: int, **kwargs):
         return self.initial_weight

-    def _step_weight_schedule(self, step, max_num_steps: int, final_value: float, **kwargs):
-        return self.initial_weight if step < max_num_steps else final_value
+    def _step_weight_schedule(self, step: int, final_step: int, final_value: float, **kwargs):
+        return self.initial_weight if step < final_step else final_value
+
+    def _linear_weight_schedule(self, step: int, initial_step: int, final_step: int, final_value: float, **kwargs):
+        if step < initial_step:
+            return self.initial_weight
+        elif step > final_step:
+            return final_value
+        else:
+            return self.initial_weight + (final_value - self.initial_weight) * (step - initial_step) / (final_step - initial_step)
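As a quick sanity check on the new linear schedule, a standalone version of the same interpolation can be evaluated at a few steps. The initial weight and the step values below are illustrative; in the module the initial weight comes from `self.initial_weight`.

```python
def linear_weight(step: int, initial_weight: float, initial_step: int, final_step: int, final_value: float) -> float:
    # Same logic as _linear_weight_schedule: hold, ramp linearly, then hold the final value.
    if step < initial_step:
        return initial_weight
    elif step > final_step:
        return final_value
    return initial_weight + (final_value - initial_weight) * (step - initial_step) / (final_step - initial_step)

# With initial_weight=1.0, initial_step=500, final_step=2000, final_value=0.0:
print(linear_weight(0, 1.0, 500, 2000, 0.0))     # 1.0 (before the ramp starts)
print(linear_weight(1250, 1.0, 500, 2000, 0.0))  # 0.5 (halfway through the ramp)
print(linear_weight(3000, 1.0, 500, 2000, 0.0))  # 0.0 (after the ramp ends)
```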