Enable RDMA for weight sync (set use_dcp: false for qwen3_8b; copy weights to CPU before ts.put)

casteryh · casteryh · commit 001a6b6ad11a · 2025-10-04T18:20:28.000-07:00
diff --git a/apps/grpo/qwen3_8b.yaml b/apps/grpo/qwen3_8b.yaml
@@ -42,7 +42,7 @@ policy:
 
 # Trainer configuration
 trainer:
-  use_dcp: true
+  use_dcp: false
   use_vllm_builtin_load: true
   model:
     name: qwen3
diff --git a/src/forge/actors/trainer.py b/src/forge/actors/trainer.py
@@ -403,7 +403,8 @@ async def push_weights(self, policy_version: int) -> None:
         else:
             for name, param in hf_state_dict.items():
                 key = get_param_key(policy_version, name)
-                await ts.put(key, param)
+                # RDMA is still broken for GPU tensors, so detach and copy each param to CPU before the put
+                await ts.put(key, param.detach().cpu())
             t.step("ts_save")
         t.stop()
         end_time = time.perf_counter()