Enhances PBT usage experience through small improvements (#3449)

ooctipus · kellyguo11 · web-flow · commit 90dda53f7c08 · 2025-09-19T15:11:15.000-07:00
# Description This PR incorporates feedback from PBT users and makes the following improvements: 1. added resume logic to allow wandb to continue on the same run_id 2. corrected broadcasting order in distributed setup 3. made score query general by using dotted keys to access dictionary of arbitrary depth Fixes # (issue) - Bug fix (non-breaking change which fixes an issue) ## Screenshots Please attach before and after screenshots of the change if applicable.  ## Checklist - [x] I have read and understood the [contribution guidelines](https://isaac-sim.github.io/IsaacLab/main/source/refs/contributing.html) - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format` - [x] I have made corresponding changes to the documentation - [ ] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [x] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there  --------- Co-authored-by: Kelly Guo <kellyg@nvidia.com>
diff --git a/docs/source/features/population_based_training.rst b/docs/source/features/population_based_training.rst
@@ -49,7 +49,7 @@ Example Config
      num_policies: 8
      directory: .
      workspace: "pbt_workspace"
-     objective: Curriculum/difficulty_level
+     objective: episode.Curriculum/difficulty_level
      interval_steps: 50000000
      threshold_std: 0.1
      threshold_abs: 0.025
@@ -66,9 +66,9 @@ Example Config
        agent.params.config.tau: "mutate_discount"
 
 
-``objective: Curriculum/difficulty_level`` uses ``infos["episode"]["Curriculum/difficulty_level"]`` as the scalar to
-**rank policies** (higher is better).  With ``num_policies: 8``, launch eight processes sharing the same ``workspace``
-and unique ``policy_idx`` (0-7).
+``objective: episode.Curriculum/difficulty_level`` is the dotted expression that uses
+``infos["episode"]["Curriculum/difficulty_level"]`` as the scalar to **rank policies** (higher is better).
+With ``num_policies: 8``, launch eight processes sharing the same ``workspace`` and unique ``policy_idx`` (0-7).
 
 
 Launching PBT
diff --git a/scripts/reinforcement_learning/rl_games/train.py b/scripts/reinforcement_learning/rl_games/train.py
@@ -226,8 +226,9 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
             monitor_gym=True,
             save_code=True,
         )
-        wandb.config.update({"env_cfg": env_cfg.to_dict()})
-        wandb.config.update({"agent_cfg": agent_cfg})
+        if not wandb.run.resumed:
+            wandb.config.update({"env_cfg": env_cfg.to_dict()})
+            wandb.config.update({"agent_cfg": agent_cfg})
 
     if args_cli.checkpoint is not None:
         runner.run({"train": True, "play": False, "sigma": train_sigma, "checkpoint": resume_path})
diff --git a/source/isaaclab_rl/config/extension.toml b/source/isaaclab_rl/config/extension.toml
@@ -1,7 +1,7 @@
 [package]
 
 # Note: Semantic Versioning is used: https://semver.org/
-version = "0.4.0"
+version = "0.4.1"
 
 # Description
 title = "Isaac Lab RL"
diff --git a/source/isaaclab_rl/docs/CHANGELOG.rst b/source/isaaclab_rl/docs/CHANGELOG.rst
@@ -1,6 +1,18 @@
 Changelog
 ---------
 
+0.4.1 (2025-09-09)
+~~~~~~~~~~~~~~~~~~
+
+Fixed
+^^^^^
+
+* Improved the PBT user experience with the following fixes:
+* Added resume logic to allow wandb to continue on the same ``run_id``.
+* Corrected the broadcasting order in the distributed setup.
+* Generalized the score query by using dotted keys to access dictionaries of arbitrary depth.
+
+
 0.4.0 (2025-09-09)
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/source/isaaclab_rl/isaaclab_rl/rl_games/pbt/pbt.py b/source/isaaclab_rl/isaaclab_rl/rl_games/pbt/pbt.py
@@ -68,9 +68,12 @@ def process_infos(self, infos, done_indices):
         """Extract the scalar objective from environment infos and store in `self.score`.
 
         Notes:
-            Expects the objective to be at `infos["episode"][self.cfg.objective]`.
+            Expects `self.cfg.objective` to be a dotted path into `infos`, e.g. `a.b` reads `infos["a"]["b"]`.
         """
-        self.score = infos["episode"][self.cfg.objective]
+        score = infos
+        for part in self.cfg.objective.split("."):
+            score = score[part]
+        self.score = score
 
     def after_steps(self):
         """Main PBT tick executed every train step.
@@ -84,6 +87,9 @@ def after_steps(self):
                whitelisted params, set `restart_flag`, broadcast (if distributed),
                and print a mutation diff table.
         """
+        if self.distributed_args.distributed:
+            dist.broadcast(self.restart_flag, src=0)
+
         if self.distributed_args.rank != 0:
             if self.restart_flag.cpu().item() == 1:
                 os._exit(0)
@@ -154,9 +160,6 @@ def after_steps(self):
             self.new_params = mutate(cur_params, self.cfg.mutation, self.cfg.mutation_rate, self.cfg.change_range)
             self.restart_from_checkpoint = os.path.abspath(ckpts[replacement_policy_candidate]["checkpoint"])
             self.restart_flag[0] = 1
-            if self.distributed_args.distributed:
-                dist.broadcast(self.restart_flag, src=0)
-
             self.printer.print_mutation_diff(cur_params, self.new_params)
 
     def _restart_with_new_params(self, new_params, restart_from_checkpoint):
@@ -191,6 +194,11 @@ def _restart_with_new_params(self, new_params, restart_from_checkpoint):
         if self.wandb_args.enabled:
             import wandb
 
+            # NOTE: values set here are only inherited by the restarted child process and do not propagate beyond it;
+            # setdefault also leaves any value already present in the environment untouched.
+            os.environ.setdefault("WANDB_RUN_ID", wandb.run.id)  # continue with the same run id
+            os.environ.setdefault("WANDB_RESUME", "allow")  # allow wandb to resume
+            os.environ.setdefault("WANDB_INIT_TIMEOUT", "300")  # give wandb init more time to be fault tolerant
             wandb.run.finish()
 
         # Get the directory of the current file