Commit d843f02

fix: Fix policy worker placement when using unified placement group (#1341)
Signed-off-by: Guyue Huang <[email protected]>
Signed-off-by: Guyue Huang <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 3a69c21 commit d843f02

File tree

10 files changed (+176, -8 lines)


nemo_rl/algorithms/dpo.py

Lines changed: 3 additions & 0 deletions
@@ -248,6 +248,9 @@ def setup(
         init_optimizer=True,
         init_reference_model=True,
     )
+    # print the node IP and GPU ID of the policy workers for debugging
+    policy.print_node_ip_and_gpu_id()
+
     loss_fn = DPOLossFn(master_config["dpo"])
     print(" ✓ Model initialized")

nemo_rl/algorithms/grpo.py

Lines changed: 2 additions & 0 deletions
@@ -482,6 +482,8 @@ def setup(
         optimizer_path=optimizer_path,
         init_optimizer=True,
     )
+    # print the node IP and GPU ID of the policy workers for debugging
+    policy.print_node_ip_and_gpu_id()

     # if it is not colocated inference, initialize collective communication for update weights
     if not colocated_inference:

nemo_rl/algorithms/rm.py

Lines changed: 3 additions & 0 deletions
@@ -223,6 +223,9 @@ def setup(
         init_optimizer=True,
         init_reference_model=False,
     )
+    # print the node IP and GPU ID of the policy workers for debugging
+    policy.print_node_ip_and_gpu_id()
+
     loss_fn = PreferenceLoss()
     print(" ✓ Model initialized")

nemo_rl/algorithms/sft.py

Lines changed: 3 additions & 0 deletions
@@ -202,6 +202,9 @@ def setup(
         init_optimizer=True,
         init_reference_model=False,
     )
+    # print the node IP and GPU ID of the policy workers for debugging
+    policy.print_node_ip_and_gpu_id()
+
     loss_fn = NLLLoss()
     print(" ✓ Model initialized")

nemo_rl/distributed/virtual_cluster.py

Lines changed: 70 additions & 0 deletions
@@ -21,6 +21,7 @@
 from ray.util.placement_group import (
     PlacementGroup,
     placement_group,
+    placement_group_table,
     remove_placement_group,
 )
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -170,6 +171,14 @@ def init_ray(log_dir: Optional[str] = None) -> None:
     )


+@ray.remote(num_gpus=1)
+class GetGPUIDActor:  # pragma: no cover
+    """Util actor class to return GPU id of the current worker."""
+
+    def get_gpu_id(self):
+        return ray.get_gpu_ids()[0]
+
+
 class ResourceInsufficientError(Exception):
     """Exception raised when the cluster does not have enough resources to satisfy the requested configuration."""
@@ -210,6 +219,7 @@ def __init__(
         self._bundle_ct_per_node_list = bundle_ct_per_node_list
         self._world_size = sum(self._bundle_ct_per_node_list)
         self._node_placement_groups: Optional[list[PlacementGroup]] = None
+        self._sorted_bundle_indices: Optional[list[int]] = None

         self.num_gpus_per_node = num_gpus_per_node
         self.use_gpus = use_gpus
@@ -251,6 +261,8 @@ def _init_placement_groups(
             self._node_placement_groups = self._create_placement_groups_internal(
                 strategy, use_unified_pg
             )
+            if use_unified_pg and self.use_gpus:
+                self._sorted_bundle_indices = self._get_sorted_bundle_indices()
             return self._node_placement_groups
         except ResourceInsufficientError as e:
             print(e)
@@ -402,8 +414,66 @@ def get_master_address_and_port(self) -> tuple[str, int]:
         Returns:
             Tuple of (address, port)
         """
+        # Get placement groups if not already created
+        if not self._node_placement_groups:
+            self.get_placement_groups()
+
+        # If sorted bundle indices are available, get the address and port for the first bundle index
+        if self._sorted_bundle_indices is not None:
+            return self.get_available_address_and_port(
+                pg_idx=0, bundle_idx=self._sorted_bundle_indices[0]
+            )
+
+        # Otherwise, get the address and port for bundle index 0
         return self.get_available_address_and_port(pg_idx=0, bundle_idx=0)

+    def _get_sorted_bundle_indices(self) -> Optional[list[int]]:
+        """Gets the sorted bundle indices for the placement groups."""
+        if self._node_placement_groups is None:
+            raise ValueError(
+                "Placement groups must be initialized before calling _get_sorted_bundle_indices"
+            )
+
+        if not self.use_gpus:
+            return None
+
+        if len(self._node_placement_groups) != 1:
+            return None
+
+        pg = self._node_placement_groups[0]
+        pg_data = placement_group_table(pg)
+        num_bundles = len(pg_data["bundles"])
+        bundle_to_node_ids = pg_data["bundles_to_node_id"]
+
+        # use info actor to get the GPU id
+        info_actors = []
+        for i in range(num_bundles):
+            info_actors.append(
+                GetGPUIDActor.options(
+                    num_cpus=0.01,  # set both num_cpus and num_gpus to be small values to enable assignment in colocated case
+                    num_gpus=0.01,
+                    resources=None,
+                    scheduling_strategy=PlacementGroupSchedulingStrategy(
+                        placement_group=pg,
+                        placement_group_bundle_index=i,
+                    ),
+                ).remote()
+            )
+
+        gpu_ids = ray.get([actor.get_gpu_id.remote() for actor in info_actors])
+        for actor in info_actors:
+            ray.kill(actor)
+
+        # original index, node_id, gpu_id
+        bundle_infos = [
+            (i, bundle_to_node_ids[i], gpu_ids[i]) for i in range(num_bundles)
+        ]
+        pg_reordered_bundle_indices = [
+            bundle_info[0]
+            for bundle_info in sorted(bundle_infos, key=lambda x: (x[1], x[2]))
+        ]  # sort by node_id, then gpu_id
+        return pg_reordered_bundle_indices
+
     def shutdown(self) -> bool:
         """Cleans up and releases all resources associated with this virtual cluster.
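
The heart of the fix is the (node_id, gpu_id) sort in _get_sorted_bundle_indices above: with a unified placement group, Ray's bundle order is not guaranteed to follow node or GPU order, so the cluster probes each bundle with a short-lived GetGPUIDActor and reorders the bundle indices. A minimal standalone sketch of that reordering, with invented bundle metadata standing in for placement_group_table() and the actor probes:

# Illustrative only: bundle_to_node_id and gpu_ids are invented here; in the
# commit they come from placement_group_table(pg) and the GetGPUIDActor probes.
bundle_to_node_id = {0: "node-b", 1: "node-a", 2: "node-a", 3: "node-b"}
gpu_ids = {0: 1, 1: 0, 2: 1, 3: 0}

bundle_infos = [(i, bundle_to_node_id[i], gpu_ids[i]) for i in range(4)]
sorted_indices = [
    info[0] for info in sorted(bundle_infos, key=lambda x: (x[1], x[2]))
]
# Sorted by node, then GPU: node-a/GPU0, node-a/GPU1, node-b/GPU0, node-b/GPU1
print(sorted_indices)  # [1, 2, 3, 0]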

nemo_rl/models/policy/dtensor_policy_worker.py

Lines changed: 6 additions & 0 deletions
@@ -1929,3 +1929,9 @@ def start_gpu_profiling(self) -> None:
     def stop_gpu_profiling(self) -> None:
         """Stop GPU profiling."""
         torch.cuda.profiler.stop()
+
+    def report_node_ip_and_gpu_id(self) -> list[tuple[str, int]]:
+        """Report the node IP and GPU ID of the current worker."""
+        ip = ray._private.services.get_node_ip_address()
+        gpu_id = ray.get_gpu_ids()[0]
+        return (ip, gpu_id)
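
The report_node_ip_and_gpu_id method above (duplicated in the other policy workers below) pairs a worker's node IP with its assigned GPU. As a rough standalone sketch of the same idea outside the worker classes, a plain Ray actor could report the pair via the public ray.util.get_node_ip_address() instead of the ray._private call; the actor name here is made up:

import ray

@ray.remote(num_gpus=1)
class IpAndGpuReporter:  # hypothetical helper, for illustration only
    def report(self):
        # (node IP, GPU id) of the process this actor runs in
        return ray.util.get_node_ip_address(), ray.get_gpu_ids()[0]

# Usage sketch; requires a running Ray cluster with at least one free GPU:
# ray.init()
# print(ray.get(IpAndGpuReporter.remote().report.remote()))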

nemo_rl/models/policy/dtensor_policy_worker_v2.py

Lines changed: 6 additions & 0 deletions
@@ -1913,3 +1913,9 @@ def start_gpu_profiling(self) -> None:
     def stop_gpu_profiling(self) -> None:
         """Stop GPU profiling."""
         torch.cuda.profiler.stop()
+
+    def report_node_ip_and_gpu_id(self) -> list[tuple[str, int]]:
+        """Report the node IP and GPU ID of the current worker."""
+        ip = ray._private.services.get_node_ip_address()
+        gpu_id = ray.get_gpu_ids()[0]
+        return (ip, gpu_id)

nemo_rl/models/policy/lm_policy.py

Lines changed: 60 additions & 8 deletions
@@ -167,14 +167,33 @@ def __init__(
             pre_init_communication_queue=pre_init_queue,
         )

-        self.worker_group = RayWorkerGroup(
-            cluster,
-            worker_builder,
-            name_prefix=name_prefix,
-            workers_per_node=workers_per_node,
-            sharding_annotations=self.sharding_annotations,
-            env_vars=env_vars or {},
-        )
+        if cluster._sorted_bundle_indices is not None:
+            # The cluster has initialized a unified placement group across nodes
+            # In this case, we need to create workers based on sorted bundle indices
+            group_size = cluster.num_gpus_per_node
+            tied_groups = [
+                (i // group_size, [bundle_idx])
+                for i, bundle_idx in enumerate(cluster._sorted_bundle_indices)
+            ]
+
+            self.worker_group = RayWorkerGroup(
+                cluster,
+                worker_builder,
+                name_prefix=name_prefix,
+                bundle_indices_list=tied_groups,
+                sharding_annotations=self.sharding_annotations,
+                env_vars=env_vars or {},
+            )
+
+        else:
+            self.worker_group = RayWorkerGroup(
+                cluster,
+                worker_builder,
+                name_prefix=name_prefix,
+                workers_per_node=workers_per_node,
+                sharding_annotations=self.sharding_annotations,
+                env_vars=env_vars or {},
+            )

         if config["dynamic_batching"]["enabled"]:
             assert pp_size == 1, (
@@ -755,3 +774,36 @@ def stop_gpu_profiling(self) -> None:
         """Stop GPU profiling."""
         futures = self.worker_group.run_all_workers_single_data("stop_gpu_profiling")
         ray.get(futures)
+
+    def print_node_ip_and_gpu_id(self) -> list[tuple[str, int]]:
+        """Print the node IP and GPU ID of the current worker."""
+        results = ray.get(
+            self.worker_group.run_all_workers_single_data(
+                "report_node_ip_and_gpu_id",
+            )
+        )
+        all_node_ips = sorted(set([result[0] for result in results]))
+        all_gpu_ids = sorted(set([result[1] for result in results]))
+
+        worker_id_list = [
+            [list() for _ in range(len(all_gpu_ids))] for _ in range(len(all_node_ips))
+        ]
+        for worker_id, (ip, gpu_id) in enumerate(results):
+            node_idx = all_node_ips.index(ip)
+            gpu_idx = all_gpu_ids.index(gpu_id)
+            worker_id_list[node_idx][gpu_idx].append("worker-" + str(worker_id))
+
+        from prettytable import PrettyTable
+
+        table = PrettyTable()
+        table.title = "Policy worker mapping to Nodes and GPUs"
+        table.field_names = ["Node_IP"] + [
+            "GPU_ID=" + str(gpu_id) for gpu_id in all_gpu_ids
+        ]
+        for i, node_idx in enumerate(all_node_ips):
+            row = [node_idx]
+            for j in range(len(all_gpu_ids)):
+                row.append(tuple(worker_id_list[i][j]))
+            table.add_row(row)
+
+        print(table)
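
To make the tied_groups construction above concrete, here is a small sketch with assumed inputs (a pre-sorted bundle index list and 2 GPUs per node); it only shows the grouping arithmetic, not the RayWorkerGroup call:

# Assumed inputs, for illustration only
sorted_bundle_indices = [1, 2, 3, 0]  # e.g. the output of the node/GPU sort
num_gpus_per_node = 2

tied_groups = [
    (i // num_gpus_per_node, [bundle_idx])
    for i, bundle_idx in enumerate(sorted_bundle_indices)
]
print(tied_groups)  # [(0, [1]), (0, [2]), (1, [3]), (1, [0])]

Each entry pairs a node-group id with a single-bundle list, so after the (node, GPU) sort the workers belonging to one node land in consecutive groups.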

nemo_rl/models/policy/megatron_policy_worker.py

Lines changed: 6 additions & 0 deletions
@@ -2228,3 +2228,9 @@ def start_gpu_profiling(self) -> None:
     def stop_gpu_profiling(self) -> None:
         """Stop GPU profiling."""
         torch.cuda.profiler.stop()
+
+    def report_node_ip_and_gpu_id(self) -> list[tuple[str, int]]:
+        """Report the node IP and GPU ID of the current worker."""
+        ip = ray._private.services.get_node_ip_address()
+        gpu_id = ray.get_gpu_ids()[0]
+        return (ip, gpu_id)

tests/unit/distributed/test_virtual_cluster.py

Lines changed: 17 additions & 0 deletions
@@ -229,3 +229,20 @@ def test_mcore_py_executable():
     assert "megatron-bridge is imported" in result.stdout
     assert "megatron-core is imported" in result.stdout
     assert "megatron-training is imported" in result.stdout
+
+
+def test_create_sorted_bundle_indices_for_unified_pg():
+    """Test that sorted bundle indices are created for a unified placement group."""
+    cluster = RayVirtualCluster(bundle_ct_per_node_list=[2], use_gpus=True)
+    cluster._init_placement_groups(strategy=None, use_unified_pg=True)
+    assert cluster._sorted_bundle_indices is not None
+    assert len(cluster._sorted_bundle_indices) == 2
+    assert 0 in cluster._sorted_bundle_indices
+    assert 1 in cluster._sorted_bundle_indices
+
+
+def test_not_create_sorted_bundle_indices_for_per_node_pg():
+    """Test that sorted bundle indices are not created for a per-node placement group."""
+    cluster = RayVirtualCluster(bundle_ct_per_node_list=[2], use_gpus=True)
+    cluster._init_placement_groups(strategy=None, use_unified_pg=False)
+    assert cluster._sorted_bundle_indices is None
