Skip to content

Commit 33e64a5

Browse files
draft
Signed-off-by: Youngeun Kwon <youngeunk@nvidia.com>
1 parent 557b7ec commit 33e64a5

File tree

11 files changed: +85 additions, -47 deletions

nemo_rl/algorithms/distillation.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -424,11 +424,16 @@ def setup(
424424
if not colocated_inference:
425425
ip, port = train_cluster.get_master_address_and_port()
426426
print(f"Using ip: {ip}, port: {port} for collective communication", flush=True)
427+
train_world_size = train_cluster.world_size()
427428
# inference cluster + head node of the train cluster
428-
world_size = inference_nodes * inference_gpus_per_node + 1
429+
world_size = train_world_size + inference_nodes * inference_gpus_per_node
429430
# init collective
430-
futures_train = student_policy.init_collective(ip, port, world_size)
431-
futures_inference = student_generation.init_collective(ip, port, world_size) # type: ignore
431+
futures_train = student_policy.init_collective(
432+
ip, port, world_size, train_world_size=train_world_size
433+
)
434+
futures_inference = student_generation.init_collective(
435+
ip, port, world_size, train_world_size=train_world_size
436+
) # type: ignore
432437
# wait for all futures to complete
433438
ray.get(futures_train + futures_inference)
434439

nemo_rl/algorithms/grpo.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -432,11 +432,17 @@ def setup(
432432
if not colocated_inference:
433433
ip, port = train_cluster.get_master_address_and_port()
434434
print(f"Using ip: {ip}, port: {port} for collective communication", flush=True)
435-
# inference cluster + head node of the train cluster
436-
world_size = inference_nodes * inference_gpus_per_node + 1
435+
# world includes all training workers and all inference workers
436+
train_world_size = train_cluster.world_size()
437+
inference_world_size = inference_nodes * inference_gpus_per_node
438+
world_size = train_world_size + inference_world_size
437439
# init collective
438-
futures_train = policy.init_collective(ip, port, world_size)
439-
futures_inference = policy_generation.init_collective(ip, port, world_size) # type: ignore
440+
futures_train = policy.init_collective(
441+
ip, port, world_size, train_world_size=train_world_size
442+
)
443+
futures_inference = policy_generation.init_collective(
444+
ip, port, world_size, train_world_size=train_world_size
445+
) # type: ignore
440446
# wait for all futures to complete
441447
ray.get(futures_train + futures_inference)
442448

nemo_rl/models/generation/vllm/vllm_backend.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,20 @@
3232

3333
class VllmInternalWorkerExtension:
3434
def init_collective(
35-
self, rank_prefix: int, ip: str, port: int, world_size: int
35+
self,
36+
rank_prefix: int,
37+
ip: str,
38+
port: int,
39+
world_size: int,
40+
train_world_size: int,
3641
) -> None:
3742
"""Initialize the collective communication."""
3843
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
3944
from vllm.distributed.utils import StatelessProcessGroup
4045

4146
local_rank = torch.distributed.get_rank()
42-
rank = rank_prefix + local_rank + 1 # 1 is the head node of the train cluster
47+
# Place vLLM ranks after all training ranks so all training workers can join
48+
rank = train_world_size + rank_prefix + local_rank
4349

4450
pg = StatelessProcessGroup.create(
4551
host=ip, port=port, rank=rank, world_size=world_size

nemo_rl/models/generation/vllm/vllm_generation.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ def _post_init(self):
368368
return results
369369

370370
def init_collective(
371-
self, ip: str, port: int, world_size: int
371+
self, ip: str, port: int, world_size: int, *, train_world_size: int
372372
) -> list[ray.ObjectRef]:
373373
"""Initialize the collective communication."""
374374
if not self.worker_group or not self.worker_group.workers:
@@ -395,7 +395,12 @@ def init_collective(
395395
method_name,
396396
rank_prefix=rank_prefix_list,
397397
run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"],
398-
common_kwargs={"ip": ip, "port": port, "world_size": world_size},
398+
common_kwargs={
399+
"ip": ip,
400+
"port": port,
401+
"world_size": world_size,
402+
"train_world_size": train_world_size,
403+
},
399404
)
400405

401406
# this function should co-work with lm_policy, so we should wait for all futures to complete outside

nemo_rl/models/generation/vllm/vllm_worker.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,12 @@ def post_init(self):
477477
self.vllm_device_ids = self.report_device_id()
478478

479479
def init_collective(
480-
self, rank_prefix: int, ip: str, port: int, world_size: int
480+
self,
481+
rank_prefix: int,
482+
ip: str,
483+
port: int,
484+
world_size: int,
485+
train_world_size: int,
481486
) -> None:
482487
self.llm.collective_rpc(
483488
"init_collective",
@@ -486,6 +491,7 @@ def init_collective(
486491
ip,
487492
port,
488493
world_size,
494+
train_world_size,
489495
),
490496
)
491497

nemo_rl/models/generation/vllm/vllm_worker_async.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,12 @@ def _setup_vllm_server(self) -> "tuple[threading.Thread, str, uvicorn.Server]":
393393
return thread, base_url, server
394394

395395
async def init_collective_async(
396-
self, rank_prefix: int, ip: str, port: int, world_size: int
396+
self,
397+
rank_prefix: int,
398+
ip: str,
399+
port: int,
400+
world_size: int,
401+
train_world_size: int,
397402
) -> None:
398403
await self.llm.collective_rpc(
399404
"init_collective",
@@ -402,6 +407,7 @@ async def init_collective_async(
402407
ip,
403408
port,
404409
world_size,
410+
train_world_size,
405411
),
406412
)
407413

nemo_rl/models/policy/dtensor_policy_worker.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -501,17 +501,18 @@ def train_context(cp_context: Optional[Generator[None, None, None]] = None):
501501

502502
yield
503503

504-
def init_collective(self, ip: str, port: int, world_size: int) -> None:
504+
def init_collective(
505+
self, ip: str, port: int, world_size: int, *, train_world_size: int
506+
) -> None:
505507
"""Initialize the collective communication."""
506508
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
507509
from vllm.distributed.utils import StatelessProcessGroup
508510

509-
if self.rank == 0:
510-
pg = StatelessProcessGroup.create(
511-
host=ip, port=port, rank=0, world_size=world_size
512-
)
513-
device = torch.cuda.current_device()
514-
self.model_update_group = PyNcclCommunicator(pg, device=device)
511+
pg = StatelessProcessGroup.create(
512+
host=ip, port=port, rank=self.rank, world_size=world_size
513+
)
514+
device = torch.cuda.current_device()
515+
self.model_update_group = PyNcclCommunicator(pg, device=device)
515516

516517
def is_alive(self) -> bool:
517518
return True
@@ -1808,9 +1809,8 @@ def broadcast_weights_for_collective(self) -> None:
18081809
for _, tensor in self.model.state_dict().items():
18091810
if isinstance(tensor, DTensor):
18101811
tensor = tensor.full_tensor()
1811-
if self.rank == 0:
1812-
tensor = tensor.to(self.dtype, non_blocking=True)
1813-
self.model_update_group.broadcast(tensor.data, src=0)
1812+
tensor = tensor.to(self.dtype, non_blocking=True)
1813+
self.model_update_group.broadcast(tensor.data, src=0)
18141814

18151815
# Manually move model to cpu for cpu offload case
18161816
# cpu offload needs model on CPU before model forward

nemo_rl/models/policy/dtensor_policy_worker_v2.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -459,17 +459,17 @@ def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor:
459459
logits.div_(self.cfg["generation"]["temperature"])
460460
return logits
461461

462-
def init_collective(self, ip: str, port: int, world_size: int) -> None:
463-
"""Initialize the collective communication."""
462+
def init_collective(
463+
self, ip: str, port: int, world_size: int, *, train_world_size: int
464+
) -> None:
464465
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
465466
from vllm.distributed.utils import StatelessProcessGroup
466467

467-
if self.rank == 0:
468-
pg = StatelessProcessGroup.create(
469-
host=ip, port=port, rank=0, world_size=world_size
470-
)
471-
device = torch.cuda.current_device()
472-
self.model_update_group = PyNcclCommunicator(pg, device=device)
468+
pg = StatelessProcessGroup.create(
469+
host=ip, port=port, rank=self.rank, world_size=world_size
470+
)
471+
device = torch.cuda.current_device()
472+
self.model_update_group = PyNcclCommunicator(pg, device=device)
473473

474474
def is_alive(self) -> bool:
475475
return True
@@ -1770,9 +1770,8 @@ def broadcast_weights_for_collective(self) -> None:
17701770
for _, tensor in self.model.state_dict().items():
17711771
if isinstance(tensor, DTensor):
17721772
tensor = tensor.full_tensor()
1773-
if self.rank == 0:
1774-
tensor = tensor.to(self.dtype, non_blocking=True)
1775-
self.model_update_group.broadcast(tensor.data, src=0)
1773+
tensor = tensor.to(self.dtype, non_blocking=True)
1774+
self.model_update_group.broadcast(tensor.data, src=0)
17761775

17771776
# Manually move model to cpu for cpu offload case
17781777
# cpu offload needs model on CPU before model forward

nemo_rl/models/policy/interfaces.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def shutdown(self) -> bool:
141141
class ColocatablePolicyInterface(PolicyInterface):
142142
@abstractmethod
143143
def init_collective(
144-
self, ip: str, port: int, world_size: int
144+
self, ip: str, port: int, world_size: int, *, train_world_size: int
145145
) -> list[ray.ObjectRef]:
146146
pass
147147

nemo_rl/models/policy/lm_policy.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,15 @@ def __init__(
234234
self.cfg = config
235235

236236
def init_collective(
237-
self, ip: str, port: int, world_size: int
237+
self, ip: str, port: int, world_size: int, *, train_world_size: int
238238
) -> list[ray.ObjectRef]:
239239
"""Initialize the collective communication."""
240240
futures = self.worker_group.run_all_workers_single_data(
241-
"init_collective", ip=ip, port=port, world_size=world_size
241+
"init_collective",
242+
ip=ip,
243+
port=port,
244+
world_size=world_size,
245+
train_world_size=train_world_size,
242246
)
243247
# this function should co-work with vllm, so we should wait for all futures to complete outside
244248
return futures

Comments (0)