
Commit bbbb3d6

fix: fix non-colocated with cpu_offload enabled (#861)
Signed-off-by: Yuki Huang <[email protected]>
1 parent 88a399e commit bbbb3d6

File tree

2 files changed: +102 -39 lines changed

  nemo_rl/models/policy/dtensor_policy_worker.py
  tests/unit/models/generation/test_vllm_generation.py

nemo_rl/models/policy/dtensor_policy_worker.py

Lines changed: 18 additions & 4 deletions
@@ -1224,8 +1224,11 @@ def prepare_weights_for_ipc(self) -> tuple[list[tuple[str, int]], float]:
         """
         from nemo_rl.utils.nvml import get_free_memory_bytes

+        # Manually move model to cuda for cpu offload case
+        if self.cpu_offload:
+            self.model = self.move_to_cuda(self.model)
+
         # Get state_dict
-        self.model = self.move_to_cuda(self.model)
         self._held_sharded_state_dict_reference: dict[str, torch.Tensor] = (
             self.model.state_dict()
         )
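The hunk above makes the pre-refit onload conditional on cpu_offload before the sharded state_dict is captured for IPC export. As a rough standalone illustration of why the onload matters in the offload case (this is not the nemo_rl code, and reduce_tensor is just one way to produce CUDA IPC metadata): IPC handles can only be created for tensors that already live in GPU memory, so a CPU-offloaded parameter has to be moved onto the GPU first.

# Standalone illustration, not nemo_rl code: CUDA IPC export needs GPU-resident tensors.
import torch
from torch.multiprocessing.reductions import reduce_tensor

cpu_offload = True
weight = torch.randn(4, 4)  # stand-in parameter kept on CPU under cpu_offload

if cpu_offload and torch.cuda.is_available():
    weight = weight.cuda()  # onload, mirroring move_to_cuda in the hunk above

if weight.is_cuda:
    handle = reduce_tensor(weight)  # (rebuild_fn, args) another process can map via CUDA IPC
    print("exported IPC metadata:", type(handle))
else:
    print("No GPU available; CUDA IPC handle export requires a CUDA tensor.")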
@@ -1283,13 +1286,27 @@ def get_weights_ipc_handles(self, keys: Iterable[str]) -> dict[str, Any]:
     @torch.no_grad()
     def broadcast_weights_for_collective(self) -> None:
         """Broadcast the weights for collective communication."""
+        # Manually move model to cuda for cpu offload case
+        if self.cpu_offload:
+            print(
+                "[WARNING]: Unless you are lacking of memory, it is not recommended to enable cpu_offload when "
+                "using non-colocated generation since it will have an extra onload and offload at refit stage."
+            )
+            self.model = self.move_to_cuda(self.model)
+
+        # Broadcast the weights for collective communication
         for _, tensor in self.model.state_dict().items():
             if isinstance(tensor, DTensor):
                 tensor = tensor.full_tensor()
             if self.rank == 0:
                 tensor = tensor.to(self.dtype, non_blocking=True)
                 self.model_update_group.broadcast(tensor.data, src=0)

+        # Manually move model to cpu for cpu offload case
+        # cpu offload needs model on CPU before model forward
+        if self.cpu_offload:
+            self.model = self.move_to_cpu(self.model)
+
     def prepare_for_lp_inference(self) -> None:
         if not self.cpu_offload:
             self.move_to_cuda(self.model)
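To see the refit-time behavior this hunk adds in isolation, here is a minimal self-contained sketch of the onload, broadcast, offload sequence under cpu_offload. It uses a single-process gloo group and a plain nn.Linear as stand-ins; the real worker broadcasts a sharded DTensor model over the model_update_group shown in the diff, so treat this only as an illustration of the pattern, not the nemo_rl implementation.

# Minimal standalone sketch of the onload -> broadcast -> offload pattern.
import os
import torch
import torch.distributed as dist
import torch.nn as nn

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29511")
dist.init_process_group("gloo", rank=0, world_size=1)  # toy single-rank group

cpu_offload = True
model = nn.Linear(8, 8)  # stand-in for the policy model, resident on CPU

if cpu_offload:
    # onload before the broadcast (the addition in this commit)
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")

for _, tensor in model.state_dict().items():
    # rank 0 is the source of truth for every other rank in the group
    dist.broadcast(tensor.data, src=0)

if cpu_offload:
    # offload again so the next forward under cpu_offload finds the model on CPU
    model = model.cpu()

dist.destroy_process_group()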
@@ -1308,9 +1325,6 @@ def prepare_for_training(self, *args, **kwargs) -> None:
         # to cuda automatically, so we need to do that manually
         self.model = self.move_buffer_to_device(self.model, "cuda")

-        # have to move buffers to cuda manually for cpu offload case
-        self.move_buffer_to_device(self.model, "cuda")
-
         self.model.train()
         # Move optimizer state to CUDA if it exists
         if (
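Taken together, the worker-side changes above touch both refit paths. The outline below is an assumed sketch (not the actual nemo_rl helper) of how a caller such as refit_policy_generation, used in the test changes that follow, picks between them:

# Assumed outline only: ties the worker methods changed above to the
# colocated/non-colocated split exercised by the tests below.
def refit_sketch(lm_policy, vllm_policy, colocated: bool) -> None:
    if colocated:
        # Same GPUs: export CUDA IPC handles (prepare_weights_for_ipc /
        # get_weights_ipc_handles) so the vLLM workers map the tensors in place.
        ...
    else:
        # Separate GPUs: stream weights over the collective group
        # (broadcast_weights_for_collective), which now onloads the model before
        # the broadcast and offloads it afterwards when cpu_offload is enabled.
        ...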

tests/unit/models/generation/test_vllm_generation.py

Lines changed: 84 additions & 35 deletions
@@ -623,29 +623,16 @@ def configure_worker_fixed_seed(num_gpus, bundle_indices=None):
     torch.cuda.empty_cache()


-@pytest.mark.timeout(360)
-@pytest.mark.asyncio
-@pytest.mark.parametrize("async_engine", [True, False])
-async def test_vllm_generation_with_hf_training(cluster, tokenizer, async_engine):
-    """1. Use vLLM for generation
-    2. Use HF policy for training and logprob computation
+async def run_hf_train_process(
+    lm_policy, vllm_policy, tokenizer, async_engine, colocated
+):
+    """Validates that the two policies can work together.

-    This test validates that the two policies can work together.
+    1. Use vLLM for generation
+    2. Use HF policy for training and logprob computation
     """
-    from nemo_rl.models.policy.lm_policy import Policy
     from tests.unit.test_utils import SimpleNLLLoss

-    # Create separate configs for each policy
-    vllm_config = deepcopy(basic_vllm_test_config)
-    vllm_config["vllm_cfg"]["async_engine"] = async_engine
-    vllm_config = configure_generation_config(vllm_config, tokenizer)
-
-    dtensor_config = deepcopy(basic_dtensor_test_config)
-    dtensor_config["train_global_batch_size"] = 4
-
-    vllm_policy = None
-    lm_policy = None
-
     try:
         prompts = [
             "Write a story about a magical forest",
@@ -677,22 +664,8 @@ async def test_vllm_generation_with_hf_training(cluster, tokenizer, async_engine
             }
         )

-        # Create both policies
-        print("Creating vLLM policy...")
-        vllm_policy = VllmGeneration(cluster, vllm_config)
-        vllm_policy.finish_generation()
-
-        print("Creating DTensor policy...")
-        lm_policy = Policy(cluster, dtensor_config, tokenizer)
-
-        print("preparing refit info...")
-        state_dict_info = lm_policy.prepare_refit_info()
-        vllm_policy.prepare_refit_info(state_dict_info)
-
         print("refitting vllm policy...")
-        refit_policy_generation(
-            lm_policy, vllm_policy, vllm_config["colocated"]["enabled"]
-        )
+        refit_policy_generation(lm_policy, vllm_policy, colocated)

         # Step 1: Use vLLM for generation
         print("Using vLLM policy for fast generation...")
@@ -794,7 +767,7 @@ async def test_vllm_generation_with_hf_training(cluster, tokenizer, async_engine
         print(f"Training loss: {results['loss']}")

         lm_policy.finish_training()
-        lm_policy.offload_after_refit()
+        refit_policy_generation(lm_policy, vllm_policy, colocated)

         # Step 4: Use vLLM for generation again to complete the workflow
         print("Using vLLM for generation again...")
@@ -821,6 +794,82 @@ async def test_vllm_generation_with_hf_training(cluster, tokenizer, async_engine
         lm_policy.shutdown()


+@pytest.mark.timeout(300)
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("async_engine", "cpu_offload"), [(True, False), (False, True)]
+)
+async def test_vllm_generation_with_hf_training_colocated(
+    cluster, tokenizer, async_engine, cpu_offload
+):
+    """This test validates that DTensor policy can work together with colocated vLLM policy."""
+    # Create VllmGeneration Policy
+    print("Creating vLLM policy...")
+    vllm_config = deepcopy(basic_vllm_test_config)
+    vllm_config["vllm_cfg"]["async_engine"] = async_engine
+    vllm_config = configure_generation_config(vllm_config, tokenizer)
+    vllm_policy = VllmGeneration(cluster, vllm_config)
+    vllm_policy.finish_generation()
+
+    # Create Policy
+    print("Creating DTensor policy...")
+    dtensor_config = deepcopy(basic_dtensor_test_config)
+    dtensor_config["dtensor_cfg"]["cpu_offload"] = cpu_offload
+    dtensor_config["train_global_batch_size"] = 4
+    lm_policy = Policy(cluster, dtensor_config, tokenizer)
+
+    # Prepare refit info
+    print("Preparing refit info...")
+    state_dict_info = lm_policy.prepare_refit_info()
+    vllm_policy.prepare_refit_info(state_dict_info)
+
+    # Test
+    await run_hf_train_process(lm_policy, vllm_policy, tokenizer, async_engine, True)
+
+
+@pytest.mark.timeout(300)
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("async_engine", "cpu_offload"), [(True, False), (False, True)]
+)
+async def test_vllm_generation_with_hf_training_non_colocated(
+    policy_cluster_separate, tokenizer, async_engine, cpu_offload
+):
+    """This test validates that DTensor policy can work together with non-colocated vLLM policy."""
+    generation_cluster_separate = get_generation_cluster_separate(1)
+
+    # Create VllmGeneration Policy
+    print("Creating vLLM policy...")
+    vllm_config = deepcopy(basic_vllm_test_config)
+    vllm_config["vllm_cfg"]["async_engine"] = async_engine
+    vllm_config["colocated"]["enabled"] = False
+    vllm_config = configure_generation_config(vllm_config, tokenizer)
+    vllm_policy = VllmGeneration(generation_cluster_separate, vllm_config)
+    vllm_policy.finish_generation()
+
+    # Create Policy
+    print("Creating DTensor policy...")
+    dtensor_config = deepcopy(basic_dtensor_test_config)
+    dtensor_config["generation"]["colocated"]["enabled"] = False
+    dtensor_config["dtensor_cfg"]["cpu_offload"] = cpu_offload
+    dtensor_config["train_global_batch_size"] = 4
+    lm_policy = Policy(policy_cluster_separate, dtensor_config, tokenizer)
+
+    # Refit
+    # initialize collective communication for update weights
+    ip, port = policy_cluster_separate.get_master_address_and_port()
+    futures_train = lm_policy.init_collective(ip, port, world_size=2)
+    futures_inference = vllm_policy.init_collective(ip, port, world_size=2)
+    ray.get(futures_train + futures_inference)
+
+    # prepare refit info
+    state_dict_info = lm_policy.prepare_refit_info()
+    vllm_policy.prepare_refit_info(state_dict_info)
+
+    # Test
+    await run_hf_train_process(lm_policy, vllm_policy, tokenizer, async_engine, False)
+
+
 def test_vllm_policy_tensor_parallel(cluster, tokenizer):
     """Test vLLM policy with tensor parallelism > 1."""
     # Configure with tensor_parallel_size=2
