Skip to content

Commit 76764ad

Browse files
authored
revive feet height reward (#506)
* revive feet height reward
* better scale
1 parent 28a2ea8 commit 76764ad

File tree

2 files changed

+72
-36
lines changed

2 files changed

+72
-36
lines changed

examples/walking.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -519,19 +519,25 @@ def get_rewards(self, physics_model: ksim.PhysicsModel) -> dict[str, ksim.Reward
519519
),
520520
"foot_airtime": ksim.FeetAirTimeReward(
521521
ctrl_dt=self.config.ctrl_dt,
522-
threshold=self.config.gait_period / 2.0,
522+
period=self.config.gait_period / 2.0,
523+
contact_obs="feet_contact",
524+
scale=1.0,
525+
),
526+
"foot_height": ksim.FeetHeightReward(
527+
ctrl_dt=self.config.ctrl_dt,
528+
period=self.config.gait_period / 2.0,
523529
contact_obs="feet_contact",
524530
position_obs="feet_position",
525531
height=self.config.max_foot_height,
526532
scale=1.0,
527533
),
528534
"foot_force": ksim.FeetForcePenalty(
529535
force_obs="feet_force",
530-
scale=-0.1,
536+
scale=-1e-6,
531537
),
532538
"foot_torque": ksim.FeetTorquePenalty(
533539
torque_obs="feet_torque",
534-
scale=-0.1,
540+
scale=-1e-7,
535541
),
536542
}
537543

ksim/rewards.py

Lines changed: 63 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"LinkJerkPenalty",
2828
"ReachabilityPenalty",
2929
"FeetAirTimeReward",
30+
"FeetHeightReward",
3031
"FeetForcePenalty",
3132
"FeetTorquePenalty",
3233
"SinusoidalGaitReward",
@@ -47,11 +48,7 @@
4748
from ksim.commands import AngularVelocityCommandValue, LinearVelocityCommandValue, SinusoidalGaitCommandValue
4849
from ksim.types import PhysicsModel, Trajectory
4950
from ksim.utils.mujoco import get_body_data_idx_from_name, get_qpos_data_idxs_by_name
50-
from ksim.utils.validators import (
51-
CartesianIndex,
52-
cartesian_index_to_dim,
53-
norm_validator,
54-
)
51+
from ksim.utils.validators import CartesianIndex, cartesian_index_to_dim, norm_validator
5552
from ksim.vis import Marker
5653

5754
logger = logging.getLogger(__name__)
@@ -729,27 +726,22 @@ def get_reward(self, traj: Trajectory) -> jnp.ndarray:
729726
class FeetAirTimeReward(StatefulReward):
730727
"""Reward for feet either touching or not touching the ground for some time."""
731728

732-
threshold: float = attrs.field()
729+
period: float = attrs.field()
733730
ctrl_dt: float = attrs.field()
734731
contact_obs: str = attrs.field()
735-
position_obs: str = attrs.field()
736-
height: float = attrs.field()
737732
num_feet: int = attrs.field(default=2)
738733
bias: float = attrs.field(default=0.0)
739734
linvel_moving_threshold: float = attrs.field(default=0.05)
740735
angvel_moving_threshold: float = attrs.field(default=0.05)
741736

742-
def initial_carry(self, rng: PRNGKeyArray) -> tuple[Array, Array]:
743-
return (
744-
jnp.zeros(self.num_feet, dtype=jnp.int32),
745-
jnp.zeros(self.num_feet, dtype=jnp.float32),
746-
)
737+
def initial_carry(self, rng: PRNGKeyArray) -> Array:
738+
return jnp.zeros(self.num_feet, dtype=jnp.int32)
747739

748740
def get_reward_stateful(
749741
self,
750742
trajectory: Trajectory,
751-
reward_carry: tuple[Array, Array],
752-
) -> tuple[Array, tuple[Array, Array]]:
743+
reward_carry: Array,
744+
) -> tuple[Array, Array]:
753745
not_moving_lin = jnp.linalg.norm(trajectory.qvel[..., :2], axis=-1) < self.linvel_moving_threshold
754746
not_moving_ang = trajectory.qvel[..., 5] < self.angvel_moving_threshold
755747
not_moving = not_moving_lin & not_moving_ang
@@ -758,39 +750,77 @@ def get_reward_stateful(
758750
sensor_data_tn = sensor_data_tcn.any(axis=-2)
759751
chex.assert_shape(sensor_data_tn, (..., self.num_feet))
760752

761-
position_tn3 = trajectory.obs[self.position_obs]
762-
chex.assert_shape(position_tn3, (..., self.num_feet, 3))
763-
764-
threshold_steps = round(self.threshold / self.ctrl_dt)
753+
threshold_steps = round(self.period / self.ctrl_dt)
765754

766755
def scan_fn(
767-
carry: tuple[Array, Array],
768-
x: tuple[Array, Array, Array, Array],
769-
) -> tuple[tuple[Array, Array], tuple[Array, Array]]:
770-
(count_n, max_height_n), (contact_n, position_n3, not_moving, done) = carry, x
756+
carry: Array,
757+
x: tuple[Array, Array, Array],
758+
) -> tuple[Array, Array]:
759+
count_n, (contact_n, not_moving, done) = carry, x
771760
reset = done | not_moving | contact_n
772761
count_n = jnp.where(reset, 0, count_n + 1)
762+
return count_n, count_n
773763

774-
height_n = position_n3[..., 2]
775-
max_height_n = jnp.where(reset, 0.0, jnp.maximum(max_height_n, height_n))
776-
777-
return (count_n, max_height_n), (count_n, max_height_n)
778-
779-
reward_carry, (count_tn, max_height_tn) = xax.scan(
764+
reward_carry, count_tn = xax.scan(
780765
scan_fn,
781766
reward_carry,
782-
(sensor_data_tn, position_tn3, not_moving, trajectory.done),
767+
(sensor_data_tn, not_moving, trajectory.done),
783768
)
784769

785770
# Gradually increase reward until `threshold_steps`.
786771
reward_tn = (count_tn.astype(jnp.float32) / threshold_steps) + self.bias
787772
reward_tn = jnp.where((count_tn > 0) & (count_tn < threshold_steps), reward_tn, 0.0)
773+
reward_t = reward_tn.sum(axis=-1)
774+
return reward_t, reward_carry
788775

789-
# Scale the reward according to the max height.
790-
reward_tn = reward_tn * max_height_tn.clip(max=self.height) / self.height
791776

792-
reward_t = reward_tn.sum(axis=-1)
777+
@attrs.define(frozen=True, kw_only=True)
778+
class FeetHeightReward(StatefulReward):
779+
"""Reward for feet either touching or not touching the ground for some time."""
780+
781+
period: float = attrs.field()
782+
ctrl_dt: float = attrs.field()
783+
contact_obs: str = attrs.field()
784+
position_obs: str = attrs.field()
785+
height: float = attrs.field()
786+
num_feet: int = attrs.field(default=2)
787+
bias: float = attrs.field(default=0.0)
788+
linvel_moving_threshold: float = attrs.field(default=0.05)
789+
angvel_moving_threshold: float = attrs.field(default=0.05)
790+
791+
def initial_carry(self, rng: PRNGKeyArray) -> tuple[Array, Array]:
792+
return (
793+
jnp.zeros(self.num_feet, dtype=jnp.float32),
794+
jnp.zeros(self.num_feet, dtype=jnp.float32),
795+
)
793796

797+
def get_reward_stateful(
798+
self,
799+
trajectory: Trajectory,
800+
reward_carry: tuple[Array, Array],
801+
) -> tuple[Array, tuple[Array, Array]]:
802+
contact_tcn = trajectory.obs[self.contact_obs] > 0.5 # Values are either 0 or 1.
803+
contact_tn = contact_tcn.any(axis=-2)
804+
chex.assert_shape(contact_tn, (..., self.num_feet))
805+
806+
position_tn3 = trajectory.obs[self.position_obs]
807+
chex.assert_shape(position_tn3, (..., self.num_feet, 3))
808+
809+
# Give a sparse reward once the foot contacts the ground, equal to the
810+
# maximum height of the foot since the last contact, thresholded at the
811+
# target height.
812+
def scan_fn(carry: tuple[Array, Array], x: tuple[Array, Array]) -> tuple[tuple[Array, Array], Array]:
813+
(elapsed_time_n, max_height_n), (contact_n, position_n3) = carry, x
814+
height_n = position_n3[..., 2]
815+
scale = (elapsed_time_n / self.period).clip(max=1.0)
816+
reward_n = jnp.where(contact_n, max_height_n, 0.0).clip(max=self.height) * scale
817+
max_height_n = jnp.maximum(max_height_n, height_n)
818+
max_height_n = jnp.where(contact_n, 0.0, max_height_n)
819+
elapsed_time_n = jnp.where(contact_n, 0.0, elapsed_time_n + self.ctrl_dt)
820+
return (elapsed_time_n, max_height_n), reward_n
821+
822+
reward_carry, reward_tn = xax.scan(scan_fn, reward_carry, (contact_tn, position_tn3))
823+
reward_t = reward_tn.max(axis=-1)
794824
return reward_t, reward_carry
795825

796826

0 commit comments

Comments
 (0)