Skip to content

Commit 93cb841

Browse files
committed
fix pd mem_manager get failed
1 parent 14473db commit 93cb841

File tree

4 files changed

+11
-1
lines changed

4 files changed

+11
-1
lines changed

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_trans_obj.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,11 @@ def init_all(self, device_id: int, manager: "DecodeKVMoveManager"):
281281
self.task_out_queue,
282282
)
283283
assert self.task_out_queue.get(timeout=30) == "proc_start"
284+
# 确保在子进程读取共享内存之前,主进程已经将 mem_manager 写入共享内存
284285
if self.device_id == 0:
285286
manager._put_mem_manager_to_shm()
287+
# 通知子进程可以从共享内存读取 mem_manager
288+
self.task_in_queue.put("mem_managers_ready")
286289
assert self.task_out_queue.get(timeout=60) == "get_mem_managers_ok"
287290

288291
return True

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_trans_process.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ def _init_env(args, device_id: int, task_in_queue: mp.Queue, task_out_queue: mp.
111111
graceful_registry(inspect.currentframe().f_code.co_name)
112112
task_out_queue.put("proc_start")
113113

114+
# 等待主进程将 mem_manager 写入共享内存后的信号
115+
assert task_in_queue.get(timeout=60) == "mem_managers_ready"
116+
114117
# 从共享内存读取所有rank的mem_manager
115118
node_world_size = args.tp // args.nnodes
116119
mem_managers: List[MemoryManager] = [MemoryManager.from_shm(rank, device_id) for rank in range(node_world_size)]
@@ -136,7 +139,7 @@ def _init_env(args, device_id: int, task_in_queue: mp.Queue, task_out_queue: mp.
136139
logger.warning(f"unexpected task type: {task}")
137140

138141
except Exception as e:
139-
logger.error(f"Fatal error happened in kv trans process: {e}")
142+
logger.error(f"Fatal error happened in kv trans process: {e} in device {device_id}")
140143
raise
141144

142145

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_trans_obj.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,7 @@ def init_all(self, device_id: int, manager: "PrefillKVMoveManager"):
357357
assert self.task_out_queue.get(timeout=30) == "proc_start"
358358
if self.device_id == 0:
359359
manager._put_mem_manager_to_shm()
360+
self.task_in_queue.put("mem_managers_ready")
360361
assert self.task_out_queue.get(timeout=60) == "get_mem_managers_ok"
361362

362363
return True

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_trans_process.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ def _init_env(
116116
)
117117
task_out_queue.put("proc_start")
118118

119+
# 等待主进程将 mem_manager 写入共享内存后的信号
120+
assert task_in_queue.get(timeout=60) == "mem_managers_ready"
121+
119122
# 从共享内存读取所有rank的mem_manager
120123
node_world_size = args.tp // args.nnodes
121124
mem_managers: List[MemoryManager] = [MemoryManager.from_shm(rank, device_id) for rank in range(node_world_size)]

0 commit comments

Comments
 (0)