Skip to content

Commit 645c678

Browse files
authored
Fix recorder dataset filename collision in shared /tmp (#469)
## Summary

- Always append a timestamp and rank to the recorder dataset filename.
- Previously this was only done in distributed mode (to avoid HDF5 file lock conflicts between ranks); now it also prevents permission errors when multiple processes or repeated runs share `/tmp` in the container.
- Non-distributed runs simply get a `_rank0` suffix in addition to the timestamp.

`/tmp` is mounted and shared across container processes. All runs write to `/tmp/isaaclab/logs/dataset.hdf5`. If the file was created by a different process/user, subsequent runs get `PermissionError: [Errno 13] Unable to synchronously create file`.

Signed-off-by: Clemens Volk <cvolk@nvidia.com>
1 parent dcc7e03 commit 645c678

File tree

1 file changed

+8
-9
lines changed

1 file changed

+8
-9
lines changed

isaaclab_arena/environments/arena_env_builder.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from __future__ import annotations
77

88
import argparse
9+
import datetime
910
import gymnasium as gym
1011

1112
from isaaclab.envs import ManagerBasedRLMimicEnv
@@ -91,13 +92,12 @@ def _solve_relations(self) -> None:
9192
else:
9293
print(f"Relation solving not completed after {result.attempts} attempt(s)")
9394

94-
def _modify_recorder_cfg_for_distributed(self, recorder_cfg: RecorderManagerBaseCfg) -> RecorderManagerBaseCfg:
95-
"""Modify the recorder dataset filename for distributed multi-gpu envs.
96-
This is to avoid HDF5 file lock conflict when distributed: each rank uses a unique dataset filename.
97-
"""
98-
if getattr(self.args, "distributed", False):
99-
base = getattr(recorder_cfg, "dataset_filename", "dataset")
100-
recorder_cfg.dataset_filename = f"{base}_rank{get_local_rank()}"
95+
def _modify_recorder_cfg_dataset_filename(self, recorder_cfg: RecorderManagerBaseCfg) -> RecorderManagerBaseCfg:
96+
"""Modify the recorder dataset filename to include the timestamp and rank."""
97+
base = getattr(recorder_cfg, "dataset_filename", "dataset")
98+
recorder_cfg.dataset_filename = (
99+
f"{base}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}_rank{get_local_rank()}"
100+
)
101101
return recorder_cfg
102102

103103
# This method gives the arena environment a chance to modify the environment configuration.
@@ -168,8 +168,7 @@ def compose_manager_cfg(self) -> IsaacLabArenaManagerBasedRLEnvCfg:
168168
embodiment.get_recorder_term_cfg(),
169169
bases=(RecorderManagerBaseCfg,),
170170
)
171-
# Only modify the recorder configuration for distributed multi-gpu envs.
172-
recorder_manager_cfg = self._modify_recorder_cfg_for_distributed(recorder_manager_cfg)
171+
recorder_manager_cfg = self._modify_recorder_cfg_dataset_filename(recorder_manager_cfg)
173172

174173
rewards_cfg = combine_configclass_instances(
175174
"RewardsCfg",

0 commit comments

Comments
 (0)