Skip to content

Commit 06c92a8

Browse files
authored
reward updates (#497)
* reward updates * multiple reward outputs * training fixes * nit changes * joystick * oh man, much better * tweaked rewards * reward changes * norm change
1 parent 9a8212a commit 06c92a8

File tree

4 files changed

+84
-73
lines changed

4 files changed

+84
-73
lines changed

examples/walking.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -359,8 +359,8 @@ def get_mujoco_model(self) -> mujoco.MjModel: # pyright: ignore[reportAttribute
359359
def get_mujoco_model_metadata(self, mj_model: mujoco.MjModel) -> ksim.Metadata: # pyright: ignore[reportAttributeAccessIssue]
360360
return ksim.Metadata.from_model(
361361
mj_model,
362-
kp=10.0,
363-
kd=0.1,
362+
kp=50.0,
363+
kd=1.0,
364364
)
365365

366366
def get_actuators(
@@ -461,7 +461,7 @@ def get_commands(self, physics_model: ksim.PhysicsModel) -> list[ksim.Command]:
461461
gait_period=self.config.gait_period,
462462
ctrl_dt=self.config.ctrl_dt,
463463
max_height=self.config.max_foot_height,
464-
height_offset=0.04,
464+
height_offset=0.08,
465465
),
466466
joystick=ksim.JoystickCommand(
467467
run_speed=self.config.target_linear_velocity,
@@ -470,15 +470,12 @@ def get_commands(self, physics_model: ksim.PhysicsModel) -> list[ksim.Command]:
470470
rotation_speed=self.config.target_angular_velocity,
471471
),
472472
),
473-
ksim.BaseHeightCommand(
474-
min_height=0.9,
475-
max_height=1.4,
476-
),
477473
]
478474

479475
def get_rewards(self, physics_model: ksim.PhysicsModel) -> list[ksim.Reward]:
480476
return [
481477
ksim.StayAliveReward(scale=100.0),
478+
ksim.UprightReward(scale=5.0),
482479
ksim.EasyJoystickReward(
483480
gait=ksim.SinusoidalGaitReward(
484481
scale=5.0,
@@ -492,7 +489,6 @@ def get_rewards(self, physics_model: ksim.PhysicsModel) -> list[ksim.Reward]:
492489
scale=1.0,
493490
),
494491
),
495-
ksim.BaseHeightTrackingReward(scale=5.0),
496492
]
497493

498494
def get_terminations(self, physics_model: ksim.PhysicsModel) -> list[ksim.Termination]:
@@ -513,8 +509,8 @@ def get_model(self, params: ksim.InitParams) -> Model:
513509
return Model(
514510
params.key,
515511
physics_model=params.physics_model,
516-
num_actor_inputs=50,
517-
num_critic_inputs=334,
512+
num_actor_inputs=49,
513+
num_critic_inputs=336,
518514
num_joints=17,
519515
min_std=0.01,
520516
max_std=1.0,
@@ -557,16 +553,13 @@ def run_actor(
557553
# Phase is required in order to follow the gait command.
558554
gait_phase_1 = sgj_cmd.gait.phase[..., None]
559555

560-
base_height_1 = commands["base_height_command"][..., None]
561-
562556
obs_n = jnp.concatenate(
563557
[
564558
dh_joint_pos_j, # NUM_JOINTS
565559
dh_joint_vel_j / 10.0, # NUM_JOINTS
566560
proj_grav_3, # 3
567561
imu_gyro_3, # 3
568562
gait_phase_1, # 1
569-
base_height_1, # 1
570563
joystick_cmd_ohe_8, # 8
571564
],
572565
axis=-1,
@@ -597,14 +590,13 @@ def run_critic(
597590
# Sinusoidal gait joystick command.
598591
sgj_cmd: ksim.EasyJoystickCommandValue = commands["easy_joystick_command"]
599592
joystick_cmd_ohe_8 = sgj_cmd.joystick.command
593+
joystick_vel_tgts_3 = sgj_cmd.joystick.vels
600594

601595
# Foot height difference.
602596
foot_height_2 = observations["feet_position_observation"][..., 2]
603597
foot_tgt_height_2 = sgj_cmd.gait.height
604598
foot_height_diff_2 = foot_height_2 - foot_tgt_height_2
605599

606-
base_height_1 = commands["base_height_command"][..., None]
607-
608600
obs_n = jnp.concatenate(
609601
[
610602
dh_joint_pos_j, # NUM_JOINTS
@@ -618,8 +610,8 @@ def run_critic(
618610
lin_vel_obs_3, # 3
619611
ang_vel_obs_3, # 3
620612
foot_height_diff_2, # 2
621-
base_height_1, # 1
622613
joystick_cmd_ohe_8, # 8
614+
joystick_vel_tgts_3, # 3
623615
],
624616
axis=-1,
625617
)

ksim/commands.py

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -274,22 +274,13 @@ def _update_for(self, cmd: JoystickCommandValue, trajectory: Trajectory) -> None
274274
self.rgba = (r, g, b, 1.0)
275275

276276
cmd_x, cmd_y = cmd_vel[..., 0], cmd_vel[..., 1]
277-
278-
# Gets the robot's current yaw.
279-
quat = trajectory.qpos[..., 3:7]
280-
cur_yaw = xax.quat_to_yaw(quat)
281-
282-
# Rotates the command X and Y velocities to the robot's current yaw.
283-
cmd_x_rot = cmd_x * jnp.cos(cur_yaw) - cmd_y * jnp.sin(cur_yaw)
284-
cmd_y_rot = cmd_x * jnp.sin(cur_yaw) + cmd_y * jnp.cos(cur_yaw)
285-
286277
self.pos = (0, 0, self.height)
287278

288279
match cmd_idx:
289280
case 0:
290281
self._update_circle()
291282
case 1 | 2 | 3 | 6 | 7:
292-
self._update_arrow(cmd_x_rot.item(), cmd_y_rot.item())
283+
self._update_arrow(cmd_x.item(), cmd_y.item())
293284
case 4 | 5:
294285
self._update_cylinder()
295286
case _:
@@ -359,7 +350,7 @@ class JoystickCommand(Command):
359350
marker_z_offset: float = attrs.field(default=0.5)
360351
switch_prob: float = attrs.field(default=0.005)
361352

362-
def _get_vel_tgts(self, physics_data: PhysicsData, command: Array) -> Array:
353+
def _get_vel_tgts(self, command: Array) -> Array:
363354
# Gets the target X, Y, and Yaw targets.
364355
cmd_tgts = jnp.array(
365356
[
@@ -385,9 +376,9 @@ def initial_command(
385376
curriculum_level: Array,
386377
rng: PRNGKeyArray,
387378
) -> JoystickCommandValue:
388-
command = jax.random.choice(rng, jnp.arange(len(self.sample_probs)), p=jnp.array(self.sample_probs))
389-
command_ohe = jax.nn.one_hot(command, num_classes=8)
390-
vel_tgts = self._get_vel_tgts(physics_data, command)
379+
command = jax.random.choice(rng, len(self.sample_probs), p=jnp.array(self.sample_probs))
380+
command_ohe = jax.nn.one_hot(command, num_classes=len(self.sample_probs))
381+
vel_tgts = self._get_vel_tgts(command)
391382
return JoystickCommandValue(
392383
command=command_ohe,
393384
vels=vel_tgts,

ksim/rewards.py

Lines changed: 61 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
import functools
4343
import logging
4444
from abc import ABC, abstractmethod
45-
from typing import Collection, Literal, Self, final
45+
from typing import Collection, Literal, Mapping, Self, final
4646

4747
import attrs
4848
import chex
@@ -52,7 +52,6 @@
5252
from jaxtyping import Array, PRNGKeyArray, PyTree
5353

5454
from ksim.commands import EasyJoystickCommandValue, JoystickCommandValue, SinusoidalGaitCommandValue
55-
from ksim.debugging import JitLevel
5655
from ksim.types import PhysicsModel, Trajectory
5756
from ksim.utils.mujoco import get_body_data_idx_from_name, get_qpos_data_idxs_by_name
5857
from ksim.utils.validators import (
@@ -117,7 +116,7 @@ class Reward(ABC):
117116
scale_by_curriculum: bool = attrs.field(default=False)
118117

119118
@abstractmethod
120-
def get_reward(self, trajectory: Trajectory) -> Array:
119+
def get_reward(self, trajectory: Trajectory) -> Array | Mapping[str, Array]:
121120
"""Get the reward for a single trajectory.
122121
123122
Args:
@@ -158,7 +157,11 @@ def initial_carry(self, rng: PRNGKeyArray) -> PyTree:
158157
"""
159158

160159
@abstractmethod
161-
def get_reward_stateful(self, trajectory: Trajectory, reward_carry: PyTree) -> tuple[Array, PyTree]:
160+
def get_reward_stateful(
161+
self,
162+
trajectory: Trajectory,
163+
reward_carry: PyTree,
164+
) -> tuple[Array | Mapping[str, Array], PyTree]:
162165
"""Get the reward for a single trajectory.
163166
164167
This is the same as `get_reward`, but it also takes in the reward carry
@@ -718,9 +721,11 @@ def _update_arrow(self, cmd_x: float, cmd_y: float) -> None:
718721

719722
def update(self, trajectory: Trajectory) -> None:
720723
"""Visualizes the joystick command target position and orientation."""
721-
cur_xvel, cur_yvel = trajectory.qvel[..., 0].item(), trajectory.qvel[..., 1].item()
724+
quat = JoystickReward.get_quat(trajectory)
725+
linvel = trajectory.qvel[..., :3]
726+
linvel = xax.rotate_vector_by_quat(linvel, quat, inverse=True)
722727
self.pos = (0, 0, self.height)
723-
self._update_arrow(cur_xvel, cur_yvel)
728+
self._update_arrow(linvel[..., 0].item(), linvel[..., 1].item())
724729

725730
@classmethod
726731
def get(
@@ -752,42 +757,55 @@ class JoystickReward(Reward):
752757
"""Reward for following the joystick command."""
753758

754759
command_name: str = attrs.field(default="joystick_command")
755-
ang_penalty_ratio: float = attrs.field(default=2.0)
760+
dir_scale: float = attrs.field(default=1.0)
761+
mag_scale: float = attrs.field(default=1.0)
762+
yaw_scale: float = attrs.field(default=1.0)
756763

757-
@xax.jit(static_argnames=["self"], jit_level=JitLevel.UNROLL)
758-
def get_reward(self, trajectory: Trajectory) -> Array:
764+
def get_reward(self, trajectory: Trajectory) -> dict[str, Array]:
759765
if self.command_name not in trajectory.command:
760766
raise ValueError(f"Command {self.command_name} not found! Ensure that it is in the task.")
761767
return self._get_reward_for(trajectory.command[self.command_name], trajectory)
762768

763-
def _get_reward_for(self, joystick_cmd: JoystickCommandValue, trajectory: Trajectory) -> Array:
769+
@classmethod
770+
def get_quat(cls, trajectory: Trajectory) -> Array:
771+
quat = trajectory.qpos[..., 3:7]
772+
yaw = xax.quat_to_yaw(quat)
773+
zeros = jnp.zeros_like(yaw)
774+
euler = jnp.stack([zeros, zeros, yaw], axis=-1)
775+
quat = xax.euler_to_quat(euler)
776+
return quat
777+
778+
def _get_reward_for(self, joystick_cmd: JoystickCommandValue, trajectory: Trajectory) -> dict[str, Array]:
764779
# Gets the target X, Y, and Yaw velocities.
765780
tgts = joystick_cmd.vels
766781

767-
# Smooths the target velocities.
768-
trg_xvel, trg_yvel, trg_yawvel = tgts.T
769-
770782
# Gets the robot's current velocities.
771-
cur_xvel = trajectory.qvel[..., 0]
772-
cur_yvel = trajectory.qvel[..., 1]
773-
cur_yawvel = trajectory.qvel[..., 5]
774-
775-
# Gets the robot's current yaw.
776-
quat = trajectory.qpos[..., 3:7]
777-
cur_yaw = xax.quat_to_yaw(quat)
778-
779-
# Rotates the command X and Y velocities to the robot's current yaw.
780-
trg_xvel_rot = trg_xvel * jnp.cos(cur_yaw) - trg_yvel * jnp.sin(cur_yaw)
781-
trg_yvel_rot = trg_xvel * jnp.sin(cur_yaw) + trg_yvel * jnp.cos(cur_yaw)
782-
783-
# Linear reward for tracking the target velocities.
784-
pos_x_rew = -jnp.abs(trg_xvel_rot - cur_xvel)
785-
pos_y_rew = -jnp.abs(trg_yvel_rot - cur_yvel)
786-
rot_z_rew = -jnp.abs(trg_yawvel - cur_yawvel)
787-
788-
reward = (pos_x_rew + pos_y_rew + rot_z_rew) / 3.0
789-
790-
return reward
783+
quat = self.get_quat(trajectory)
784+
linvel = trajectory.qvel[..., :3]
785+
linvel = xax.rotate_vector_by_quat(linvel, quat, inverse=True)
786+
yawvel = trajectory.qvel[..., 5]
787+
788+
# Reward for tracking the direction (cosine similarity).
789+
cur_xy = linvel[..., :2]
790+
trg_xy = tgts[..., :2]
791+
cur_norm = jnp.linalg.norm(cur_xy, axis=-1)
792+
trg_norm = jnp.linalg.norm(trg_xy, axis=-1)
793+
denom_xy = cur_norm * trg_norm
794+
xy_cos_sim = (cur_xy * trg_xy).sum(axis=-1) / denom_xy.clip(min=1e-6)
795+
796+
# Reward for tracking the magnitude, in the direction of the target.
797+
xy_mag_rew = 1.0 - jnp.where(trg_norm < 1e-6, cur_norm, jnp.abs(cur_norm - trg_norm) / trg_norm.clip(min=1e-6))
798+
799+
# Reward for tracking the yaw.
800+
cur_yaw = yawvel
801+
trg_yaw = tgts[..., 2]
802+
yaw_mag_rew = 1.0 - jnp.abs(cur_yaw - trg_yaw)
803+
804+
return {
805+
"dir": xy_cos_sim * self.dir_scale,
806+
"mag": xy_mag_rew * self.mag_scale,
807+
"yaw": yaw_mag_rew * self.yaw_scale,
808+
}
791809

792810
def get_markers(self) -> Collection[Marker]:
793811
return [JoystickRewardMarker.get()]
@@ -1054,19 +1072,25 @@ class EasyJoystickReward(StatefulReward):
10541072
def initial_carry(self, rng: PRNGKeyArray) -> Array:
10551073
return self.airtime.initial_carry(rng)
10561074

1057-
def get_reward_stateful(self, trajectory: Trajectory, reward_carry: Array) -> tuple[Array, Array]:
1075+
def get_reward_stateful(self, trajectory: Trajectory, reward_carry: Array) -> tuple[dict[str, Array], Array]:
10581076
if self.command_name not in trajectory.command:
10591077
raise ValueError(f"Command {self.command_name} not found! Ensure that it is in the task.")
10601078

10611079
cmd: EasyJoystickCommandValue = trajectory.command[self.command_name]
1062-
joystick_reward = self.joystick._get_reward_for(cmd.joystick, trajectory) * self.joystick.scale
1063-
gait_reward = self.gait._get_reward_for(cmd.gait, trajectory) * self.gait.scale
1080+
joystick_reward = self.joystick._get_reward_for(cmd.joystick, trajectory)
1081+
gait_reward = self.gait._get_reward_for(cmd.gait, trajectory)
10641082
airtime_reward, airtime_carry = self.airtime.get_reward_stateful(trajectory, reward_carry)
10651083

10661084
# Mask out airtime reward when the robot is not moving.
10671085
airtime_reward = jnp.where(cmd.joystick.command.argmax(axis=-1) == 0, 0.0, airtime_reward)
10681086

1069-
total_reward = joystick_reward + gait_reward + airtime_reward * self.airtime.scale
1087+
total_reward = {
1088+
"gait": gait_reward * self.gait.scale,
1089+
"airtime": airtime_reward * self.airtime.scale,
1090+
}
1091+
for k, v in joystick_reward.items():
1092+
total_reward[f"joystick/{k}"] = v * self.joystick.scale
1093+
10701094
return total_reward, airtime_carry
10711095

10721096
def get_markers(self) -> Collection[Marker]:

ksim/task/rl.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pathlib import Path
2727
from threading import Thread
2828
from types import FrameType
29-
from typing import Any, Callable, Collection, Dict, Generic, TypeVar, cast
29+
from typing import Any, Callable, Collection, Dict, Generic, Mapping, TypeVar, cast
3030

3131
import chex
3232
import equinox as eqx
@@ -210,14 +210,18 @@ def get_rewards(
210210
reward_val, reward_carry = reward.get_reward_stateful(trajectory, reward_carry)
211211
else:
212212
reward_val = reward.get_reward(trajectory)
213-
reward_val = reward_val * reward.scale
213+
if isinstance(reward_val, Mapping):
214+
reward_val = {f"{reward_name}/{k}": v * reward.scale for k, v in reward_val.items()}
215+
else:
216+
reward_val = {reward_name: reward_val * reward.scale}
214217
if reward.scale_by_curriculum:
215-
reward_val = reward_val * curriculum_level
218+
reward_val = {k: v * curriculum_level for k, v in reward_val.items()}
216219

217-
if reward_val.shape != trajectory.done.shape:
218-
raise AssertionError(f"Reward {reward_name} shape {reward_val.shape} does not match {target_shape}")
220+
for k, v in reward_val.items():
221+
if v.shape != trajectory.done.shape:
222+
raise AssertionError(f"Reward {k} shape {v.shape} does not match {target_shape}")
219223

220-
reward_dict[reward_name] = reward_val
224+
reward_dict.update(reward_val)
221225
next_reward_carry[reward_name] = reward_carry
222226

223227
total_reward = jax.tree.reduce(jnp.add, list(reward_dict.values()))

0 commit comments

Comments
 (0)