Skip to content

Commit c70834d

Browse files
committed
fix: stop setting device_id for compatibility
(cherry picked from commit 54ff390c9a04418b3c123d8e7a2037ecdb42d8ea)
1 parent df74927 commit c70834d

File tree

2 files changed

+0
-8
lines changed

2 files changed

+0
-8
lines changed

trainer/src/agentrl/trainer/components/nccl_tensor_comm.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ def __init__(self, worker: AbstractTrainWorker, addr, port, world_size):
4242
world_size=world_size,
4343
rank=0,
4444
group_name=f"nccl_comm_{addr}_{port}",
45-
device_id=torch.device(torch.cuda.current_device()),
4645
)
4746

4847
def send(self, bucket_size):
@@ -85,7 +84,6 @@ def __init__(self, worker: AbstractAsyncRolloutWorker, addr, port, world_size, o
8584
world_size=world_size,
8685
rank=offset + worker.rank,
8786
group_name=f"nccl_comm_{addr}_{port}",
88-
device_id=torch.device("cuda:0"),
8987
)
9088

9189
async def async_receive(self):

trainer/src/agentrl/trainer/workers/fsdp_worker.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,6 @@ class FSDPWorker(AbstractTrainWorker):
8282

8383
def __init__(self, config):
8484
super().__init__()
85-
if "CUDA_VISIBLE_DEVICES" in os.environ:
86-
device = os.environ.pop("CUDA_VISIBLE_DEVICES")
87-
os.environ["LOCAL_RANK"] = device
88-
else:
89-
device = os.environ["LOCAL_RANK"]
90-
torch.cuda.set_device(f"cuda:{device}")
9185
self.config = config
9286

9387
def init_distributed(self, addr, port):

0 commit comments

Comments (0)