Commit fbd5c60

use node_nccl_group
1 parent fd0511e

File tree (1 file changed, +4 -4 lines)

  • lightllm/server/router/model_infer/mode_backend/dp_backend


lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py

Lines changed: 4 additions & 4 deletions
@@ -77,10 +77,10 @@ def init_custom(self):
 
         mp.reductions.reduce_tensor.__code__ = reduce_tensor.__code__
 
-        # each rank creates its own shared memory and writes its mem_manager into it
         self.model.mem_manager.create_shm()
 
-        # read the mem_manager of every rank
+        dist.barrier(group=self.node_nccl_group)
+
         self.mem_managers = []
         for rank_idx in range(self.node_world_size):
             if rank_idx != self.rank_in_node:
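
The barrier added in init_custom makes every rank on the node finish create_shm() before any rank loops over node_world_size to attach the other ranks' mem_managers. The diff does not show where node_nccl_group comes from; the snippet below is a minimal sketch, assuming contiguous rank numbering with node_world_size ranks per node, of how such a node-local group could be built with torch.distributed.new_group (build_node_nccl_group is an illustrative name, not lightllm's actual helper).

import torch.distributed as dist

def build_node_nccl_group(node_world_size: int):
    # Assumption: ranks are laid out contiguously per node, i.e. node 0 owns
    # ranks [0, node_world_size), node 1 owns the next node_world_size ranks, ...
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    node_id = rank // node_world_size

    node_group = None
    for n in range(world_size // node_world_size):
        ranks = list(range(n * node_world_size, (n + 1) * node_world_size))
        # new_group must be called by every rank with identical arguments,
        # even for groups the calling rank is not a member of.
        group = dist.new_group(ranks=ranks, backend="nccl")
        if n == node_id:
            node_group = group
    return node_group

# e.g. during backend init (hypothetical wiring):
# self.node_nccl_group = build_node_nccl_group(self.node_world_size)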
@@ -152,7 +152,7 @@ def _fetch_dp_prompt_cache(self, infer_reqs: List[InferReq], other_reqs: List[Tu
                 other_match.append((shm_req, kv_len, value_tensor))
 
         # wait all the ranks to finish the match
-        dist.barrier()
+        dist.barrier(group=self.node_nccl_group)
 
         # Copy the kv_indexes of this dp rank to other required req
         for match in other_match:
@@ -163,7 +163,7 @@ def _fetch_dp_prompt_cache(self, infer_reqs: List[InferReq], other_reqs: List[Tu
         self.release_all_shm_reqs([match[0] for match in other_match])
 
         # wait all the ranks to finish the copy
-        dist.barrier()
+        dist.barrier(group=self.node_nccl_group)
 
         # Perform a kv transfer, get all indexes and the corresponding dp_rank
         move_token_indexes = []
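
Both barriers in _fetch_dp_prompt_cache now gate only the ranks of one node rather than the whole world group, which is the observable difference between the removed dist.barrier() and dist.barrier(group=...). Below is a minimal, self-contained demonstration of that difference; it uses the gloo backend so it runs on CPU, and the two-ranks-per-node layout, port, and names are made up for illustration.

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank: int, world_size: int):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29517"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Pretend ranks 0-1 live on one node and ranks 2-3 on another.
    # Every rank must create every group, in the same order.
    groups = [dist.new_group(ranks=[0, 1]), dist.new_group(ranks=[2, 3])]
    my_group = groups[rank // 2]

    # Blocks only until the other rank of the same "node" arrives; a bare
    # dist.barrier() here would wait for all four ranks instead.
    dist.barrier(group=my_group)
    print(f"rank {rank} passed its node-local barrier")

    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(4,), nprocs=4)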
