44from lightllm .server import TokenLoad
55from .api_lightllm import lightllm_generate , lightllm_generate_stream
66from .api_tgi import tgi_generate_impl , tgi_generate_stream_impl
7- from lightllm .utils .net_utils import alloc_can_use_network_port
7+ from lightllm .utils .net_utils import alloc_can_use_network_port , PortLocker
88from lightllm .utils .start_utils import start_submodule_processes
99from .metrics .manager import start_metric_manager
1010from .embed_cache .manager import start_cache_manager
@@ -27,6 +27,15 @@ def normal_or_p_d_start(g_objs):
2727 if args .run_mode not in ["normal" , "prefill" , "decode" ]:
2828 return
2929
30+ assert args .zmq_mode in ["tcp://" , "ipc:///tmp/" ]
31+
32+ # Ensure that multiple instances on a single machine do not conflict with each other
33+ if args .zmq_mode == "ipc:///tmp/" :
34+ zmq_mode = f"{ args .zmq_mode } _{ str (args .nccl_port )} _"
35+ args .zmq_mode = None # args 的参数不能直接设置,只能先设置None,再设置才能成功
36+ args .zmq_mode = zmq_mode
37+ logger .info (f"zmq mode head: { args .zmq_mode } " )
38+
3039 if args .use_tgi_api :
3140 g_objs .g_generate_func = tgi_generate_impl
3241 g_objs .g_generate_stream_func = tgi_generate_stream_impl
@@ -117,9 +126,18 @@ def normal_or_p_d_start(g_objs):
117126 assert args .data_type in ["fp16" , "float16" , "bf16" , "bfloat16" , "fp32" , "float32" ]
118127
119128 already_uesd_ports = args .visual_nccl_ports + [args .nccl_port , args .port ]
129+ if args .run_mode == "decode" :
130+ already_uesd_ports = args .visual_nccl_ports + [args .nccl_port , args .port , args .pd_decode_rpyc_port ]
131+
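132+ # Lock the ports up front: when several instances are started on one machine,
133+ # a port conflict would otherwise only be detected once the model starts up.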
134+ ports_locker = PortLocker (already_uesd_ports )
135+ ports_locker .lock_port ()
136+
120137 can_use_ports = alloc_can_use_network_port (
121138 num = 6 + args .tp + args .tp + args .visual_dp * args .visual_tp , used_nccl_ports = already_uesd_ports
122139 )
140+ logger .info (f"alloced ports: { can_use_ports } " )
123141 router_port , detokenization_port , httpserver_port , visual_port , cache_port , metric_port = can_use_ports [0 :6 ]
124142 model_rpc_ports = can_use_ports [6 : 6 + args .tp ]
125143 can_use_ports = can_use_ports [6 + args .tp :]
@@ -144,6 +162,8 @@ def normal_or_p_d_start(g_objs):
144162
145163 logger .info (f"all start args:{ args } " )
146164
165+ ports_locker .release_port ()
166+
147167 if args .enable_multimodal :
148168 start_submodule_processes (
149169 start_funcs = [
0 commit comments