Skip to content

Commit f3ebb9d

Browse files
committed
fix: create a new process group for the current dp group
1 parent 644e802 commit f3ebb9d

File tree

4 files changed

+19
-16
lines changed

4 files changed

+19
-16
lines changed

lightllm/distributed/communication_op.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,11 @@
2727
from lightllm.utils.device_utils import has_nvlink
2828
from lightllm.utils.envs_utils import get_env_start_args, get_deepep_num_max_dispatch_tokens_per_rank
2929
from lightllm.utils.dist_utils import (
30-
get_current_device_id,
31-
get_node_world_size,
3230
get_global_world_size,
3331
get_dp_world_size,
3432
get_global_rank,
3533
get_current_rank_in_dp,
34+
create_new_group_for_current_dp,
3635
)
3736
from lightllm.utils.device_utils import get_device_sm_count
3837
from lightllm.utils.sgl_utils import HAS_SGL_KERNEL
@@ -63,17 +62,15 @@ def __init__(self):
6362
self.custom_reduce = None
6463
self.custom_gather = None
6564
self.dp_world_size = get_dp_world_size()
66-
ranks = list([get_global_rank() - get_current_rank_in_dp() + i for i in range(self.dp_world_size)])
67-
self.device_group = dist.new_group(ranks, backend="nccl")
65+
self.device_group = create_new_group_for_current_dp("nccl")
6866

6967
def init_custom_reduce(self) -> None:
7068
if not HAS_SGL_KERNEL or not has_nvlink() or self.dp_world_size not in [2, 4, 6, 8]:
7169
return
7270
args = get_env_start_args()
7371
if args.disable_custom_allreduce:
7472
return
75-
ranks = list([get_global_rank() - get_current_rank_in_dp() + i for i in range(self.dp_world_size)])
76-
cpu_group = dist.new_group(ranks, backend="gloo")
73+
cpu_group = create_new_group_for_current_dp("gloo")
7774
self.custom_reduce = CustomAllreduce(cpu_group, torch.cuda.current_device())
7875
logger.info("Enable Custom ALLReduce. You can disable it by settting --disable_custom_allreduce.")
7976

@@ -84,8 +81,8 @@ def init_custom_gather(self) -> None:
8481
args = get_env_start_args()
8582
if args.disable_custom_allgather:
8683
return
87-
ranks = list([get_global_rank() - get_current_rank_in_dp() + i for i in range(self.dp_world_size)])
88-
cpu_group = dist.new_group(ranks, backend="gloo")
84+
85+
cpu_group = create_new_group_for_current_dp("gloo")
8986
self.custom_gather = CustomAllgather(cpu_group, torch.cuda.current_device())
9087
logger.info("Enable Custom ALLGather. You can disable it by settting --disable_custom_allgather")
9188

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_impl.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from .decode_task_cache import g_success_kv_move_task_cache, KVMoveTask
2020
from lightllm.utils.device_utils import kv_trans_use_p2p
2121
from lightllm.utils.envs_utils import get_unique_server_name
22+
from lightllm.utils.dist_utils import create_new_group_for_current_dp
2223

2324
logger = init_logger(__name__)
2425

@@ -30,11 +31,8 @@ def __init__(self, info_queue: mp.Queue, mem_queue: mp.Queue) -> None:
3031
self.mem_queue: mp.Queue = mem_queue
3132

3233
def init_custom(self):
33-
ranks = []
34-
for i in range(self.dp_world_size):
35-
ranks.append(i + self.global_dp_rank * self.dp_world_size)
3634

37-
self.lock_nccl_group = dist.new_group(ranks=ranks, backend="gloo")
35+
self.lock_nccl_group = create_new_group_for_current_dp("gloo")
3836
logger.info(f"lock_nccl_group ranks {dist.get_rank(self.lock_nccl_group)}")
3937

4038
from .decode_infer_rpyc import PDDecodeInferRpcServer

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_impl.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from .prefill_task_cache import g_kv_move_task_cache
2020
from lightllm.utils.device_utils import kv_trans_use_p2p
2121
from lightllm.utils.envs_utils import get_unique_server_name
22+
from lightllm.utils.dist_utils import create_new_group_for_current_dp
2223

2324
logger = init_logger(__name__)
2425

@@ -30,11 +31,8 @@ def __init__(self, info_queue: mp.Queue, mem_queue: mp.Queue) -> None:
3031
self.mem_queue: mp.Queue = mem_queue
3132

3233
def init_custom(self):
33-
ranks = []
34-
for i in range(self.dp_world_size):
35-
ranks.append(i + self.global_dp_rank * self.dp_world_size)
3634

37-
self.lock_nccl_group = dist.new_group(ranks=ranks, backend="gloo")
35+
self.lock_nccl_group = create_new_group_for_current_dp("gloo")
3836
logger.info(f"lock_nccl_group ranks {dist.get_rank(self.lock_nccl_group)}")
3937

4038
from .prefill_infer_rpyc import PDPrefillInferRpcServer

lightllm/utils/dist_utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,13 @@ def set_node_world_size(node_world_size: int):
189189

190190
def get_node_world_size():
191191
return int(get_environ("LIGHTLLM_NODE_WORLD_SIZE"))
192+
193+
194+
def create_new_group_for_current_dp(backend):
    """Create and return the process group for the caller's dp (data-parallel) group.

    ``torch.distributed.new_group`` is a collective operation: every process
    in the global group must call it for *every* subgroup, in the same order,
    even for subgroups the process does not belong to.  That is why this loop
    iterates over ALL dp ranks and creates every dp subgroup on every process,
    keeping only the group this process is a member of.  Do not "optimize"
    away the non-matching iterations — doing so would deadlock or corrupt
    group creation.

    Assumes global ranks are laid out contiguously per dp group, i.e. dp group
    ``d`` owns global ranks ``[d * dp_world_size, (d + 1) * dp_world_size)``
    — TODO confirm this matches the rank layout used elsewhere.

    Args:
        backend: torch.distributed backend name, e.g. ``"nccl"`` or ``"gloo"``.

    Returns:
        The ProcessGroup containing the global ranks of the current dp group.
    """
    ans_group = None
    # Hoist loop-invariant lookups out of the loop.
    dp_world_size = get_dp_world_size()
    current_dp_rank = get_global_dp_rank()
    for iter_dp_rank in range(get_dp_size()):
        ranks = [i + iter_dp_rank * dp_world_size for i in range(dp_world_size)]
        device_group = dist.new_group(ranks, backend=backend)
        if current_dp_rank == iter_dp_rank:
            ans_group = device_group
    return ans_group

0 commit comments

Comments
 (0)