Skip to content

Commit 02b3591

Browse files
committed
Fix PD split bug for multi-node deployments.
1 parent 67919d8 commit 02b3591

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

lightllm/server/router/manager.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ async def wait_to_model_ready(self):
116 116   # 用于 kv move 管理进程 和 推理进程进行task信息的交互。
117 117   self.info_queue: mp.Queue = mp.Queue()
118 118   self.mem_queues: List[torch.multiprocessing.Queue] = [
119     - torch.multiprocessing.Queue() for _ in range(self.world_size)
    119 + torch.multiprocessing.Queue() for _ in range(self.node_world_size)
120 120   ]
121 121   self.rpc_event = multiprocessing.Event()
122 122   self.rpc_finished_event = multiprocessing.Event()
@@ -132,7 +132,7 @@ async def wait_to_model_ready(self):
132 132   rpc_event=self.rpc_event,
133 133   rpc_finished_event=self.rpc_finished_event,
134 134   info_queue=self.info_queue,
135     - mem_queue=self.mem_queues[rank_id],
    135 + mem_queue=self.mem_queues[(rank_id % node_world_size)],
136 136   router_lock=self.router_lock,
137 137   )
138 138   self.model_rpc_servers.append(rpc_model)

0 commit comments

Comments
 (0)