Skip to content

Commit d354d59

Browse files
Adds NaN check to avoid ambiguous std >= 0.0 error (#185)
1 parent 1e851f4 commit d354d59

File tree

6 files changed

+44
-15
lines changed

6 files changed

+44
-15
lines changed

docs/guide/configuration.rst

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,7 @@ Currently, RSL-RL implements two runner classes:
5252
* - ``obs_groups``
5353
- dict[str, list[str]]
5454
- required
55-
- Mapping from observation sets to observation tensors coming from the environment.
56-
* - ``run_name``
57-
- str
58-
- missing
59-
- Optional run label shown in the console output.
55+
- Mapping from observation sets to observation groups coming from the environment. See :ref:`here <observation-configuration>` for more details.
6056
* - ``save_interval``
6157
- int
6258
- required
@@ -73,6 +69,14 @@ Currently, RSL-RL implements two runner classes:
7369
- str
7470
- required for Neptune
7571
- Neptune project name used by the Neptune writer.
72+
* - ``run_name``
73+
- str
74+
- missing
75+
- Optional run label shown in the console output.
76+
* - ``check_for_nan``
77+
- bool
78+
- ``True``
79+
- Whether to check for NaN values coming from the environment.
7680
* - ``algorithm``
7781
- dict
7882
- required

rsl_rl/algorithms/ppo.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ def update(self) -> dict[str, float]:
231231
# Check if we should normalize advantages per mini batch
232232
if self.normalize_advantage_per_mini_batch:
233233
with torch.no_grad():
234-
batch.advantages = (batch.advantages - batch.advantages.mean()) / (batch.advantages.std() + 1e-8)
234+
batch.advantages = (batch.advantages - batch.advantages.mean()) / (batch.advantages.std() + 1e-8) # type: ignore
235235

236236
# Perform symmetric augmentation
237237
if self.symmetry and self.symmetry["use_data_augmentation"]:
@@ -259,7 +259,7 @@ def update(self) -> dict[str, float]:
259259
hidden_state=batch.hidden_states[0],
260260
stochastic_output=True,
261261
)
262-
actions_log_prob = self.actor.get_output_log_prob(batch.actions)
262+
actions_log_prob = self.actor.get_output_log_prob(batch.actions) # type: ignore
263263
values = self.critic(batch.observations, masks=batch.masks, hidden_state=batch.hidden_states[1])
264264
# Note: We only keep the distribution parameters and entropy of the first augmentation (the original one)
265265
distribution_params = tuple(p[:original_batch_size] for p in self.actor.output_distribution_params)
@@ -268,7 +268,7 @@ def update(self) -> dict[str, float]:
268268
# Compute KL divergence and adapt the learning rate
269269
if self.desired_kl is not None and self.schedule == "adaptive":
270270
with torch.inference_mode():
271-
kl = self.actor.get_kl_divergence(batch.old_distribution_params, distribution_params)
271+
kl = self.actor.get_kl_divergence(batch.old_distribution_params, distribution_params) # type: ignore
272272
kl_mean = torch.mean(kl)
273273

274274
# Reduce the KL divergence across all GPUs
@@ -294,9 +294,9 @@ def update(self) -> dict[str, float]:
294294
param_group["lr"] = self.learning_rate
295295

296296
# Surrogate loss
297-
ratio = torch.exp(actions_log_prob - torch.squeeze(batch.old_actions_log_prob))
298-
surrogate = -torch.squeeze(batch.advantages) * ratio
299-
surrogate_clipped = -torch.squeeze(batch.advantages) * torch.clamp(
297+
ratio = torch.exp(actions_log_prob - torch.squeeze(batch.old_actions_log_prob)) # type: ignore
298+
surrogate = -torch.squeeze(batch.advantages) * ratio # type: ignore
299+
surrogate_clipped = -torch.squeeze(batch.advantages) * torch.clamp( # type: ignore
300300
ratio, 1.0 - self.clip_param, 1.0 + self.clip_param
301301
)
302302
surrogate_loss = torch.max(surrogate, surrogate_clipped).mean()

rsl_rl/runners/on_policy_runner.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from rsl_rl.algorithms import PPO
1414
from rsl_rl.env import VecEnv
1515
from rsl_rl.models import MLPModel
16-
from rsl_rl.utils import resolve_callable
16+
from rsl_rl.utils import check_nan, resolve_callable
1717
from rsl_rl.utils.logger import Logger
1818

1919

@@ -85,6 +85,9 @@ def learn(self, num_learning_iterations: int, init_at_random_ep_len: bool = Fals
8585
actions = self.alg.act(obs)
8686
# Step the environment
8787
obs, rewards, dones, extras = self.env.step(actions.to(self.env.device))
88+
# Check for NaN values from the environment
89+
if self.cfg.get("check_for_nan", True):
90+
check_nan(obs, rewards, dones)
8891
# Move to device
8992
obs, rewards, dones = (obs.to(self.device), rewards.to(self.device), dones.to(self.device))
9093
# Process the step

rsl_rl/storage/rollout_storage.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ def generator(self) -> Generator[Batch, None, None]:
213213

214214
for i in range(self.num_transitions_per_env):
215215
yield RolloutStorage.Batch(
216-
observations=self.observations[i],
216+
observations=self.observations[i], # type: ignore
217217
privileged_actions=self.privileged_actions[i],
218218
dones=self.dones[i],
219219
)
@@ -312,14 +312,14 @@ def recurrent_mini_batch_generator(
312312

313313
# Yield the mini-batch
314314
yield RolloutStorage.Batch(
315-
observations=padded_obs_trajectories[:, first_traj:last_traj],
315+
observations=padded_obs_trajectories[:, first_traj:last_traj], # type: ignore
316316
actions=self.actions[:, start:stop],
317317
values=self.values[:, start:stop],
318318
advantages=self.advantages[:, start:stop],
319319
returns=self.returns[:, start:stop],
320320
old_actions_log_prob=self.actions_log_prob[:, start:stop],
321321
old_distribution_params=tuple(p[:, start:stop] for p in self.distribution_params), # type: ignore
322-
hidden_states=(hidden_state_a_batch, hidden_state_c_batch),
322+
hidden_states=(hidden_state_a_batch, hidden_state_c_batch), # type: ignore
323323
masks=trajectory_masks[:, first_traj:last_traj],
324324
)
325325

rsl_rl/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"""Helper functions."""
77

88
from .utils import (
9+
check_nan,
910
get_param,
1011
resolve_callable,
1112
resolve_nn_activation,
@@ -16,6 +17,7 @@
1617
)
1718

1819
__all__ = [
20+
"check_nan",
1921
"get_param",
2022
"resolve_callable",
2123
"resolve_nn_activation",

rsl_rl/utils/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,26 @@ def resolve_obs_groups(
272272
return obs_groups
273273

274274

275+
def check_nan(obs: "TensorDict", rewards: torch.Tensor, dones: torch.Tensor) -> None:
    """Raise ``ValueError`` if any environment output contains NaN.

    Args:
        obs: Mapping of observation-group names to tensors returned by the environment.
        rewards: Reward tensor returned by the environment.
        dones: Done/termination tensor returned by the environment.

    Raises:
        ValueError: If any observation group, the rewards, or the dones contain NaN values.
    """
    for key, tensor in obs.items():
        # NaN only exists for floating-point dtypes; ``torch.isnan`` is not
        # implemented for ``torch.bool``, so guard to avoid a RuntimeError on
        # non-float observation groups.
        if tensor.is_floating_point() and torch.isnan(tensor).any():
            raise ValueError(
                f"The observation group '{key}' returned by the environment contains NaN values. This usually indicates"
                " a bug in the environment's step() or reset() function."
            )
    if rewards.is_floating_point() and torch.isnan(rewards).any():
        raise ValueError(
            "The rewards returned by the environment contain NaN values. This usually indicates a bug in the"
            " environment's reward computation."
        )
    # Dones are frequently bool/int tensors, for which ``torch.isnan`` would raise.
    if dones.is_floating_point() and torch.isnan(dones).any():
        raise ValueError(
            "The dones returned by the environment contain NaN values. This usually indicates a bug in the"
            " environment's termination logic."
        )
293+
294+
275295
def split_and_pad_trajectories(
276296
tensor: torch.Tensor | TensorDict, dones: torch.Tensor
277297
) -> tuple[torch.Tensor | TensorDict, torch.Tensor]:

0 commit comments

Comments
 (0)