@@ -362,6 +362,105 @@ def normal_or_p_d_start(args):
362362 return
363363
364364
def llm_only_start(args):
    """Launch the LLM-only serving stack and block until the HTTP server exits.

    The function, in order:

    1. Validates/normalizes ``args`` via ``check_and_set_args``.
    2. Locks ``args.nccl_port`` and ``args.port``, allocates ``4 + tp // nnodes``
       additional ports, and writes the allocated ports plus the pd-related
       settings back onto ``args``.
    3. Starts the metric manager, the router and the detokenization
       sub-processes through ``process_manager``.
    4. Spawns gunicorn (uvicorn workers) serving ``lightllm.server.api_http:app``
       on ``args.host:args.port``.
    5. Optionally clears an ``s3://`` model directory and starts the health
       check process, installs signal handlers, then waits for gunicorn.

    Args:
        args: Parsed start arguments. Mutated in place: ``router_port``,
            ``detokenization_port``, ``detokenization_pub_port``,
            ``metric_port``, ``pd_node_infer_rpyc_ports``, ``pd_node_id``,
            ``pd_p_allowed_port_min`` and ``pd_p_allowed_port_max`` are set,
            and ``router_max_wait_tokens`` is forced to 0 when
            ``args.run_mode == "decode"``.

    Returns:
        None. The call only returns after the gunicorn process has exited.
    """
    check_and_set_args(args)
    already_used_ports = [args.nccl_port, args.port]

    # Lock the ports up front so that, when several instances are launched on
    # the same machine, a port conflict is detected now instead of only once
    # the model is being started.
    ports_locker = PortLocker(already_used_ports)
    ports_locker.lock_port()
    try:
        node_world_size = args.tp // args.nnodes
        can_use_ports = alloc_can_use_network_port(num=4 + node_world_size, used_nccl_ports=already_used_ports)
        logger.info(f"alloced ports: {can_use_ports}")
        (
            router_port,
            detokenization_port,
            detokenization_pub_port,
            metric_port,
        ) = can_use_ports[0:4]
        can_use_ports = can_use_ports[4:]

        # Store the allocated ports on args.
        args.router_port = router_port
        args.detokenization_port = detokenization_port
        args.detokenization_pub_port = detokenization_pub_port
        args.metric_port = metric_port

        # Ports used in prefill/decode (p d) separated mode.
        args.pd_node_infer_rpyc_ports = can_use_ports[0:node_world_size]
        # Id that identifies this node in p d separated mode.
        args.pd_node_id = uuid.uuid4().int
        # Port range a prefill node may use to create the torch kv-transfer
        # distributed groups.
        args.pd_p_allowed_port_min = 20000
        args.pd_p_allowed_port_max = 30000

        # In p d separated mode a decode node schedules with a wait interval of 0.
        if args.run_mode == "decode":
            args.router_max_wait_tokens = 0

        send_and_receive_node_ip(args)  # multi-node: exchange node ips
        set_env_start_args(args)
        logger.info(f"all start args:{args}")
    finally:
        # Always give the locked ports back, even when setup fails, so a
        # failed start does not keep them held.
        ports_locker.release_port()

    process_manager.start_submodule_processes(
        start_funcs=[
            start_metric_manager,
        ],
        start_args=[(metric_port, args)],
    )

    process_manager.start_submodule_processes(
        start_funcs=[start_router_process, start_detokenization_process],
        start_args=[
            (args, router_port, detokenization_port, metric_port),
            (args, detokenization_port, detokenization_pub_port),
        ],
    )

    # Command line used to launch gunicorn.
    command = [
        "gunicorn",
        "--workers",
        f"{args.httpserver_workers}",
        "--worker-class",
        "uvicorn.workers.UvicornWorker",
        "--bind",
        f"{args.host}:{args.port}",
        "--log-level",
        "info",
        "--access-logfile",
        "-",
        "--error-logfile",
        "-",
        "lightllm.server.api_http:app",
        "--timeout",
        f"{get_lightllm_gunicorn_time_out_seconds()}",
        "--keep-alive",
        f"{get_lightllm_gunicorn_keep_alive()}",
    ]

    # Start the HTTP server as a child process.
    http_server_process = subprocess.Popen(command)

    if "s3://" in args.model_dir:
        from lightllm.utils.petrel_helper import s3_model_clear

        s3_model_clear(args.model_dir)

    if args.health_monitor:
        from lightllm.server.health_monitor.manager import start_health_check_process

        process_manager.start_submodule_processes(start_funcs=[start_health_check_process], start_args=[(args,)])
    setup_signal_handlers(http_server_process, process_manager)
    # Block here for the lifetime of the HTTP server.
    http_server_process.wait()
    return
462+
463+
365464def pd_master_start (args ):
366465 set_unique_server_name (args )
367466 if args .run_mode != "pd_master" :
0 commit comments