55import subprocess
66import signal
77from lightllm .utils .net_utils import alloc_can_use_network_port , PortLocker
8- from lightllm .utils .start_utils import process_manager , kill_recursive
8+ from lightllm .utils .start_utils import process_manager , kill_recursive , is_multimodal_mode
99from .metrics .manager import start_metric_manager
1010from .embed_cache .manager import start_cache_manager
1111from lightllm .utils .log_utils import init_logger
@@ -157,11 +157,13 @@ def check_and_set_args(args):
157157 assert args .mtp_draft_model_dir is None
158158 assert args .mtp_step == 0
159159
160+ args .enable_multimodal = is_multimodal_mode (args )
160161 # visual_only模式下才需要设置visual_embed_path
161162 if args .visual_embed_path is not None :
162163 assert (
163164 args .run_mode == "visual_only" or args .run_mode == "llm_only"
164165 ), "only visual_only or llm_only mode need visual_embed_path"
166+
165167 # 检查GPU数量是否足够
166168 if args .visual_gpu_ids is None :
167169 args .visual_gpu_ids = list (range (args .visual_dp * args .visual_tp ))
@@ -174,13 +176,11 @@ def check_and_set_args(args):
174176 args .visual_gpu_ids = args .visual_gpu_ids [:total_required_gpus ]
175177
176178 # 检查visual_nccl_port数量是否足够
177- if len (args .visual_nccl_ports ) < args .visual_dp :
179+ if args . visual_nccl_ports is not None and len (args .visual_nccl_ports ) < args .visual_dp :
178180 raise ValueError (
179181 f"Not enough visual_nccl_ports specified. You need at least { args .visual_dp } , "
180182 f"but got ({ len (args .visual_nccl_ports )} )."
181183 )
182- else :
183- args .visual_nccl_ports = args .visual_nccl_ports [: args .visual_dp ]
184184
185185 if args .visual_dp <= 0 :
186186 raise ValueError ("visual_dp must be a positive integer." )
@@ -287,7 +287,6 @@ def normal_or_p_d_start(args):
287287 logger .info (f"all start args:{ args } " )
288288
289289 ports_locker .release_port ()
290-
291290 if args .enable_multimodal :
292291 from .visualserver .manager import start_visual_process
293292
@@ -381,105 +380,6 @@ def normal_or_p_d_start(args):
381380 return
382381
383382
384- def llm_only_start (args ):
385-
386- check_and_set_args (args )
387- already_uesd_ports = [args .nccl_port , args .port ]
388-
389- # 提前锁定端口,防止在单个机器上启动多个实列的时候,要到模型启动的时候才能
390- # 捕获到端口设置冲突的问题
391- ports_locker = PortLocker (already_uesd_ports )
392- ports_locker .lock_port ()
393-
394- node_world_size = args .tp // args .nnodes
395- can_use_ports = alloc_can_use_network_port (num = 4 + node_world_size , used_nccl_ports = already_uesd_ports )
396- logger .info (f"alloced ports: { can_use_ports } " )
397- (
398- router_port ,
399- detokenization_port ,
400- detokenization_pub_port ,
401- metric_port ,
402- ) = can_use_ports [0 :4 ]
403- can_use_ports = can_use_ports [4 :]
404-
405- # 将申请好的端口放入args参数中
406- args .router_port = router_port
407- args .detokenization_port = detokenization_port
408- args .detokenization_pub_port = detokenization_pub_port
409- args .metric_port = metric_port
410-
411- # 申请在 p d 分离模式下,会用的端口
412- args .pd_node_infer_rpyc_ports = can_use_ports [0 :node_world_size ]
413- # p d 分离模式下用于标识节点的id
414- args .pd_node_id = uuid .uuid4 ().int
415- # p 节点用来建立torch kv 传输分布组的可用端口范围
416- args .pd_p_allowed_port_min = 20000
417- args .pd_p_allowed_port_max = 30000
418-
419- # p d 分离模式下,decode节点的调度间隙是0
420- if args .run_mode == "decode" :
421- args .router_max_wait_tokens = 0
422-
423- send_and_receive_node_ip (args ) # 多机用于收发node ip
424- set_env_start_args (args )
425- logger .info (f"all start args:{ args } " )
426-
427- ports_locker .release_port ()
428-
429- process_manager .start_submodule_processes (
430- start_funcs = [
431- start_metric_manager ,
432- ],
433- start_args = [(metric_port , args )],
434- )
435-
436- process_manager .start_submodule_processes (
437- start_funcs = [start_router_process , start_detokenization_process ],
438- start_args = [
439- (args , router_port , detokenization_port , metric_port ),
440- (args , detokenization_port , detokenization_pub_port ),
441- ],
442- )
443-
444- # 启动 gunicorn
445- command = [
446- "gunicorn" ,
447- "--workers" ,
448- f"{ args .httpserver_workers } " ,
449- "--worker-class" ,
450- "uvicorn.workers.UvicornWorker" ,
451- "--bind" ,
452- f"{ args .host } :{ args .port } " ,
453- "--log-level" ,
454- "info" ,
455- "--access-logfile" ,
456- "-" ,
457- "--error-logfile" ,
458- "-" ,
459- "lightllm.server.api_http:app" ,
460- "--timeout" ,
461- f"{ get_lightllm_gunicorn_time_out_seconds ()} " ,
462- "--keep-alive" ,
463- f"{ get_lightllm_gunicorn_keep_alive ()} " ,
464- ]
465-
466- # 启动子进程
467- http_server_process = subprocess .Popen (command )
468-
469- if "s3://" in args .model_dir :
470- from lightllm .utils .petrel_helper import s3_model_clear
471-
472- s3_model_clear (args .model_dir )
473-
474- if args .health_monitor :
475- from lightllm .server .health_monitor .manager import start_health_check_process
476-
477- process_manager .start_submodule_processes (start_funcs = [start_health_check_process ], start_args = [(args ,)])
478- setup_signal_handlers (http_server_process , process_manager )
479- http_server_process .wait ()
480- return
481-
482-
483383def pd_master_start (args ):
484384 set_unique_server_name (args )
485385 if args .run_mode != "pd_master" :
0 commit comments