@@ -19,12 +19,12 @@
 class PPO:
     """Proximal Policy Optimization algorithm (https://arxiv.org/abs/1707.06347)."""
 
-    actor_critic: ActorCritic
+    policy: ActorCritic
     """The actor critic module."""
 
     def __init__(
         self,
-        actor_critic,
+        policy,
         num_learning_epochs=1,
         num_mini_batches=1,
         clip_param=0.2,
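The hunk above renames the constructor keyword from `actor_critic` to `policy`. A minimal caller-side sketch of the new call, assuming the `ActorCritic` module from the same package; the network sizes and hyperparameter values are illustrative placeholders, not taken from this diff.

```python
# Hypothetical caller-side sketch: only the keyword rename comes from this diff.
from rsl_rl.algorithms import PPO
from rsl_rl.modules import ActorCritic

actor_critic = ActorCritic(num_actor_obs=48, num_critic_obs=48, num_actions=12)
# previously: PPO(actor_critic=actor_critic, ...)
ppo = PPO(policy=actor_critic, num_learning_epochs=5, num_mini_batches=4, clip_param=0.2, device="cpu")
```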
@@ -84,10 +84,10 @@ def __init__(
             self.symmetry = None
 
         # PPO components
-        self.actor_critic = actor_critic
-        self.actor_critic.to(self.device)
+        self.policy = policy
+        self.policy.to(self.device)
         # Create optimizer
-        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate)
+        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
         # Create rollout storage
         self.storage: RolloutStorage = None  # type: ignore
         self.transition = RolloutStorage.Transition()
@@ -103,41 +103,38 @@ def __init__(
         self.max_grad_norm = max_grad_norm
         self.use_clipped_value_loss = use_clipped_value_loss
 
-    def init_storage(self, num_envs, num_transitions_per_env, actor_obs_shape, critic_obs_shape, action_shape):
+    def init_storage(
+        self, training_type, num_envs, num_transitions_per_env, actor_obs_shape, critic_obs_shape, actions_shape
+    ):
         # create memory for RND as well :)
         if self.rnd:
             rnd_state_shape = [self.rnd.num_states]
         else:
             rnd_state_shape = None
         # create rollout storage
         self.storage = RolloutStorage(
+            training_type,
             num_envs,
             num_transitions_per_env,
             actor_obs_shape,
             critic_obs_shape,
-            action_shape,
+            actions_shape,
             rnd_state_shape,
             self.device,
         )
 
-    def test_mode(self):
-        self.actor_critic.test()
-
-    def train_mode(self):
-        self.actor_critic.train()
-
     def act(self, obs, critic_obs):
-        if self.actor_critic.is_recurrent:
-            self.transition.hidden_states = self.actor_critic.get_hidden_states()
-        # Compute the actions and values
-        self.transition.actions = self.actor_critic.act(obs).detach()
-        self.transition.values = self.actor_critic.evaluate(critic_obs).detach()
-        self.transition.actions_log_prob = self.actor_critic.get_actions_log_prob(self.transition.actions).detach()
-        self.transition.action_mean = self.actor_critic.action_mean.detach()
-        self.transition.action_sigma = self.actor_critic.action_std.detach()
+        if self.policy.is_recurrent:
+            self.transition.hidden_states = self.policy.get_hidden_states()
+        # compute the actions and values
+        self.transition.actions = self.policy.act(obs).detach()
+        self.transition.values = self.policy.evaluate(critic_obs).detach()
+        self.transition.actions_log_prob = self.policy.get_actions_log_prob(self.transition.actions).detach()
+        self.transition.action_mean = self.policy.action_mean.detach()
+        self.transition.action_sigma = self.policy.action_std.detach()
         # need to record obs and critic_obs before env.step()
         self.transition.observations = obs
-        self.transition.critic_observations = critic_obs
+        self.transition.privileged_observations = critic_obs
         return self.transition.actions
 
     def process_env_step(self, rewards, dones, infos):
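`init_storage` in the hunk above gains a leading `training_type` argument and renames `action_shape` to `actions_shape`, both of which flow into the `RolloutStorage` constructor. A sketch of the resulting positional order; the concrete shapes and the `"rl"` string are assumptions for illustration only.

```python
# Sketch of the new RolloutStorage positional order used above; values are placeholders
# and the "rl" training_type string is an assumption, not quoted from this diff.
from rsl_rl.storage import RolloutStorage

storage = RolloutStorage(
    "rl",    # training_type (new leading argument)
    4096,    # num_envs
    24,      # num_transitions_per_env
    [48],    # actor_obs_shape
    [48],    # critic_obs_shape
    [12],    # actions_shape (renamed from action_shape)
    None,    # rnd_state_shape (None when RND is disabled)
    "cpu",   # device
)
```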
@@ -164,14 +161,14 @@ def process_env_step(self, rewards, dones, infos):
                 self.transition.values * infos["time_outs"].unsqueeze(1).to(self.device), 1
             )
 
-        # Record the transition
+        # record the transition
         self.storage.add_transitions(self.transition)
         self.transition.clear()
-        self.actor_critic.reset(dones)
+        self.policy.reset(dones)
 
     def compute_returns(self, last_critic_obs):
         # compute value for the last step
-        last_values = self.actor_critic.evaluate(last_critic_obs).detach()
+        last_values = self.policy.evaluate(last_critic_obs).detach()
         self.storage.compute_returns(
             last_values, self.gamma, self.lam, normalize_advantage=not self.normalize_advantage_per_mini_batch
         )
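Taken together, `act`, `process_env_step`, and `compute_returns` form the per-iteration collection loop. A hedged sketch of how a runner might drive them; the `env.step(...)` return signature and the choice to feed the actor observations to the critic are assumptions, not part of this diff.

```python
# Hypothetical runner-side loop; env.step(...) returning (obs, rewards, dones, infos)
# and the shared actor/critic observations are assumptions.
import torch


def collect_rollout(ppo, env, obs, num_steps=24):
    with torch.inference_mode():
        for _ in range(num_steps):
            actions = ppo.act(obs, obs)  # records actions, values, and log-probs in the transition
            obs, rewards, dones, infos = env.step(actions)
            ppo.process_env_step(rewards, dones, infos)
        # bootstrap with the value of the last observation before computing GAE returns
        ppo.compute_returns(obs)
    return obs
```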
@@ -192,7 +189,7 @@ def update(self):  # noqa: C901
         mean_symmetry_loss = None
 
         # generator for mini batches
-        if self.actor_critic.is_recurrent:
+        if self.policy.is_recurrent:
             generator = self.storage.recurrent_mini_batch_generator(self.num_mini_batches, self.num_learning_epochs)
         else:
             generator = self.storage.mini_batch_generator(self.num_mini_batches, self.num_learning_epochs)
@@ -230,10 +227,10 @@ def update(self):  # noqa: C901
                 data_augmentation_func = self.symmetry["data_augmentation_func"]
                 # returned shape: [batch_size * num_aug, ...]
                 obs_batch, actions_batch = data_augmentation_func(
-                    obs=obs_batch, actions=actions_batch, env=self.symmetry["_env"], is_critic=False
+                    obs=obs_batch, actions=actions_batch, env=self.symmetry["_env"], obs_type="policy"
                 )
                 critic_obs_batch, _ = data_augmentation_func(
-                    obs=critic_obs_batch, actions=None, env=self.symmetry["_env"], is_critic=True
+                    obs=critic_obs_batch, actions=None, env=self.symmetry["_env"], obs_type="critic"
                 )
                 # compute number of augmentations per sample
                 num_aug = int(obs_batch.shape[0] / original_batch_size)
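The augmentation callback in the hunk above now receives an `obs_type` string (`"policy"` or `"critic"`) instead of the old boolean `is_critic`. A toy sketch of a function compatible with that interface; the sign-flip mirroring is a made-up placeholder for an environment-specific symmetry map.

```python
# Toy data_augmentation_func matching the new keyword interface; the mirroring rule
# (simple sign flip) is a placeholder, real environments define their own mapping.
import torch


def data_augmentation_func(obs=None, actions=None, env=None, obs_type="policy"):
    # stack [original; mirrored] along the batch dimension so that the first
    # batch_size rows stay the un-augmented samples, as the caller above assumes
    aug_obs = None if obs is None else torch.cat([obs, -obs], dim=0)
    aug_actions = None if actions is None else torch.cat([actions, -actions], dim=0)
    return aug_obs, aug_actions
```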
@@ -246,19 +243,17 @@ def update(self):  # noqa: C901
                 returns_batch = returns_batch.repeat(num_aug, 1)
 
             # Recompute actions log prob and entropy for current batch of transitions
-            # Note: we need to do this because we updated the actor_critic with the new parameters
+            # Note: we need to do this because we updated the policy with the new parameters
             # -- actor
-            self.actor_critic.act(obs_batch, masks=masks_batch, hidden_states=hid_states_batch[0])
-            actions_log_prob_batch = self.actor_critic.get_actions_log_prob(actions_batch)
+            self.policy.act(obs_batch, masks=masks_batch, hidden_states=hid_states_batch[0])
+            actions_log_prob_batch = self.policy.get_actions_log_prob(actions_batch)
             # -- critic
-            value_batch = self.actor_critic.evaluate(
-                critic_obs_batch, masks=masks_batch, hidden_states=hid_states_batch[1]
-            )
+            value_batch = self.policy.evaluate(critic_obs_batch, masks=masks_batch, hidden_states=hid_states_batch[1])
             # -- entropy
             # we only keep the entropy of the first augmentation (the original one)
-            mu_batch = self.actor_critic.action_mean[:original_batch_size]
-            sigma_batch = self.actor_critic.action_std[:original_batch_size]
-            entropy_batch = self.actor_critic.entropy[:original_batch_size]
+            mu_batch = self.policy.action_mean[:original_batch_size]
+            sigma_batch = self.policy.action_std[:original_batch_size]
+            entropy_batch = self.policy.entropy[:original_batch_size]
 
             # KL
             if self.desired_kl is not None and self.schedule == "adaptive":
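The body of the adaptive-KL branch is unchanged and therefore not shown in this hunk. For reference, a sketch of the usual rule: the KL between the stored and the recomputed diagonal Gaussians decides whether to shrink or grow the learning rate. The thresholds, clipping bounds, and variable names below are assumptions, not quoted from this commit.

```python
# Reference sketch only; the actual adaptive branch lives outside this diff.
import torch


def kl_adaptive_lr(learning_rate, desired_kl, mu_batch, sigma_batch, old_mu_batch, old_sigma_batch):
    # KL(old || new) for diagonal Gaussians, summed over action dimensions
    kl = torch.sum(
        torch.log(sigma_batch / old_sigma_batch + 1.0e-5)
        + (torch.square(old_sigma_batch) + torch.square(old_mu_batch - mu_batch))
        / (2.0 * torch.square(sigma_batch))
        - 0.5,
        dim=-1,
    )
    kl_mean = torch.mean(kl)
    # policy moved too far -> lower the learning rate; barely moved -> raise it
    if kl_mean > desired_kl * 2.0:
        learning_rate = max(1e-5, learning_rate / 1.5)
    elif 0.0 < kl_mean < desired_kl / 2.0:
        learning_rate = min(1e-2, learning_rate * 1.5)
    return learning_rate
```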
@@ -308,21 +303,21 @@ def update(self):  # noqa: C901
                 if not self.symmetry["use_data_augmentation"]:
                     data_augmentation_func = self.symmetry["data_augmentation_func"]
                     obs_batch, _ = data_augmentation_func(
-                        obs=obs_batch, actions=None, env=self.symmetry["_env"], is_critic=False
+                        obs=obs_batch, actions=None, env=self.symmetry["_env"], obs_type="policy"
                     )
                     # compute number of augmentations per sample
                     num_aug = int(obs_batch.shape[0] / original_batch_size)
 
                 # actions predicted by the actor for symmetrically-augmented observations
-                mean_actions_batch = self.actor_critic.act_inference(obs_batch.detach().clone())
+                mean_actions_batch = self.policy.act_inference(obs_batch.detach().clone())
 
                 # compute the symmetrically augmented actions
                 # note: we are assuming the first augmentation is the original one.
                 # We do not use the action_batch from earlier since that action was sampled from the distribution.
                 # However, the symmetry loss is computed using the mean of the distribution.
                 action_mean_orig = mean_actions_batch[:original_batch_size]
                 _, actions_mean_symm_batch = data_augmentation_func(
-                    obs=None, actions=action_mean_orig, env=self.symmetry["_env"], is_critic=False
+                    obs=None, actions=action_mean_orig, env=self.symmetry["_env"], obs_type="policy"
                 )
 
                 # compute the loss (we skip the first augmentation as it is the original one)
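The loss referred to by the last comment in the hunk above is a mirror-consistency term between the actor's mean actions on augmented observations and the symmetry-transformed original means. A hedged sketch of that computation; the MSE reduction is assumed from the surrounding comments rather than quoted from this diff.

```python
# Assumed form of the mirror loss the comment above introduces; only the augmented
# slices enter the loss, the first (original) slice is skipped.
import torch


def mirror_loss(mean_actions_batch, actions_mean_symm_batch, original_batch_size):
    return torch.nn.functional.mse_loss(
        mean_actions_batch[original_batch_size:],
        actions_mean_symm_batch.detach()[original_batch_size:],
    )
```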
@@ -349,7 +344,7 @@ def update(self):  # noqa: C901
             # -- For PPO
             self.optimizer.zero_grad()
             loss.backward()
-            nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm)
+            nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
             self.optimizer.step()
             # -- For RND
             if self.rnd_optimizer:
@@ -382,4 +377,15 @@ def update(self):  # noqa: C901
         # -- Clear the storage
         self.storage.clear()
 
-        return mean_value_loss, mean_surrogate_loss, mean_entropy, mean_rnd_loss, mean_symmetry_loss
+        # construct the loss dictionary
+        loss_dict = {
+            "value_function": mean_value_loss,
+            "surrogate": mean_surrogate_loss,
+            "entropy": mean_entropy,
+        }
+        if self.rnd:
+            loss_dict["rnd"] = mean_rnd_loss
+        if self.symmetry:
+            loss_dict["symmetry"] = mean_symmetry_loss
+
+        return loss_dict
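Because `update()` now returns a dictionary instead of a fixed tuple, callers no longer unpack placeholders for disabled features; the optional `rnd` and `symmetry` entries only appear when those components are configured. A hypothetical caller-side logging helper; the `writer` object is assumed to expose a TensorBoard-style `add_scalar(tag, value, step)` method.

```python
# Hypothetical caller-side helper, not part of this diff.
def log_losses(loss_dict, writer, step):
    # previously: mean_value_loss, mean_surrogate_loss, ... = ppo.update()
    for name, value in loss_dict.items():
        writer.add_scalar(f"Loss/{name}", value, step)
```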