
Commit 9aaf63b

committed
0820
1 parent 68dd163 commit 9aaf63b

File tree

6 files changed: +131 -9 lines changed


lightllm/server/api_http.py

Lines changed: 15 additions & 0 deletions
@@ -100,6 +100,21 @@ def set_args(self, args):
                 visual_port=args.visual_port,
                 # metric_port=args.metric_port,
             )
+        elif args.run_mode == "llm_only":
+            init_tokenizer(args)  # for openai api
+            SamplingParams.load_generation_cfg(args.model_dir)
+            self.metric_client = MetricClient(args.metric_port)
+            self.httpserver_manager = HttpServerManager(
+                args,
+                router_port=args.router_port,
+                cache_port=None,
+                detokenization_pub_port=args.detokenization_pub_port,
+                visual_port=None,
+                enable_multimodal=args.enable_multimodal,
+                metric_port=args.metric_port,
+            )
+            dp_size_in_node = max(1, args.dp // args.nnodes)  # handle multi-node pure-tp mode, where e.g. 1 // 2 == 0
+            self.shared_token_load = TokenLoad(f"{get_unique_server_name()}_shared_token_load", dp_size_in_node)
         else:
             init_tokenizer(args)  # for openai api
             SamplingParams.load_generation_cfg(args.model_dir)
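Note: in llm_only mode the HttpServerManager is built with cache_port=None and visual_port=None, so no embed cache or visual server is wired in, but the tokenizer is still initialized for the OpenAI-style API. A minimal client sketch against a started instance, assuming lightllm's documented /generate endpoint and a default host/port (both assumptions, not part of this commit):

import requests

# Minimal client sketch for an instance started with --run_mode llm_only.
# Assumes the standard lightllm /generate endpoint; adjust host/port to your args.
resp = requests.post(
    "http://127.0.0.1:8000/generate",
    json={
        "inputs": "What is the capital of France?",
        "parameters": {"max_new_tokens": 32},
    },
    timeout=60,
)
print(resp.json())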

lightllm/server/api_server.py

Lines changed: 3 additions & 1 deletion
@@ -5,13 +5,15 @@
 torch.multiprocessing.set_start_method("spawn")  # this code will not be ok for settings to fork to subprocess
 parser = make_argument_parser()
 args = parser.parse_args()
-from .api_start import pd_master_start, normal_or_p_d_start, visual_only_start, config_server_start
+from .api_start import pd_master_start, normal_or_p_d_start, visual_only_start, config_server_start, llm_only_start

 if args.run_mode == "pd_master":
     pd_master_start(args)
 elif args.run_mode == "config_server":
     config_server_start(args)
 elif args.run_mode == "visual_only":
     visual_only_start(args)
+elif args.run_mode == "llm_only":
+    llm_only_start(args)
 else:
     normal_or_p_d_start(args)
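With the dispatcher wired up, the new mode is selected at launch via --run_mode llm_only. A hypothetical launch sketch; every flag other than --run_mode is the usual lightllm argument set and may differ by version:

import subprocess

# Hypothetical launch of the new mode; --model_dir, --host and --port are the
# standard lightllm arguments referenced elsewhere in this commit.
subprocess.run(
    [
        "python", "-m", "lightllm.server.api_server",
        "--run_mode", "llm_only",
        "--model_dir", "/path/to/model",
        "--host", "0.0.0.0",
        "--port", "8000",
    ],
    check=True,
)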

lightllm/server/api_start.py

Lines changed: 99 additions & 0 deletions
@@ -362,6 +362,105 @@ def normal_or_p_d_start(args):
     return


+def llm_only_start(args):
+
+    check_and_set_args(args)
+    already_used_ports = [args.nccl_port, args.port]
+
+    # Lock the ports up front, so that when several instances start on one machine
+    # a port conflict is caught immediately rather than only once the model loads.
+    ports_locker = PortLocker(already_used_ports)
+    ports_locker.lock_port()
+
+    node_world_size = args.tp // args.nnodes
+    can_use_ports = alloc_can_use_network_port(num=4 + node_world_size, used_nccl_ports=already_used_ports)
+    logger.info(f"alloced ports: {can_use_ports}")
+    (
+        router_port,
+        detokenization_port,
+        detokenization_pub_port,
+        metric_port,
+    ) = can_use_ports[0:4]
+    can_use_ports = can_use_ports[4:]
+
+    # store the allocated ports in args
+    args.router_port = router_port
+    args.detokenization_port = detokenization_port
+    args.detokenization_pub_port = detokenization_pub_port
+    args.metric_port = metric_port
+
+    # ports used in p/d disaggregation mode
+    args.pd_node_infer_rpyc_ports = can_use_ports[0:node_world_size]
+    # node id used in p/d disaggregation mode
+    args.pd_node_id = uuid.uuid4().int
+    # port range the p node may use to build the torch kv transfer group
+    args.pd_p_allowed_port_min = 20000
+    args.pd_p_allowed_port_max = 30000
+
+    # in p/d disaggregation mode, the decode node schedules with zero wait
+    if args.run_mode == "decode":
+        args.router_max_wait_tokens = 0
+
+    send_and_receive_node_ip(args)  # multi-node: exchange node ips
+    set_env_start_args(args)
+    logger.info(f"all start args:{args}")
+
+    ports_locker.release_port()
+
+    process_manager.start_submodule_processes(
+        start_funcs=[
+            start_metric_manager,
+        ],
+        start_args=[(metric_port, args)],
+    )
+
+    process_manager.start_submodule_processes(
+        start_funcs=[start_router_process, start_detokenization_process],
+        start_args=[
+            (args, router_port, detokenization_port, metric_port),
+            (args, detokenization_port, detokenization_pub_port),
+        ],
+    )
+
+    # launch gunicorn
+    command = [
+        "gunicorn",
+        "--workers",
+        f"{args.httpserver_workers}",
+        "--worker-class",
+        "uvicorn.workers.UvicornWorker",
+        "--bind",
+        f"{args.host}:{args.port}",
+        "--log-level",
+        "info",
+        "--access-logfile",
+        "-",
+        "--error-logfile",
+        "-",
+        "lightllm.server.api_http:app",
+        "--timeout",
+        f"{get_lightllm_gunicorn_time_out_seconds()}",
+        "--keep-alive",
+        f"{get_lightllm_gunicorn_keep_alive()}",
+    ]
+
+    # start the http server subprocess
+    http_server_process = subprocess.Popen(command)
+
+    if "s3://" in args.model_dir:
+        from lightllm.utils.petrel_helper import s3_model_clear
+
+        s3_model_clear(args.model_dir)
+
+    if args.health_monitor:
+        from lightllm.server.health_monitor.manager import start_health_check_process
+
+        process_manager.start_submodule_processes(start_funcs=[start_health_check_process], start_args=[(args,)])
+    setup_signal_handlers(http_server_process, process_manager)
+    http_server_process.wait()
+    return


 def pd_master_start(args):
     set_unique_server_name(args)
     if args.run_mode != "pd_master":
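llm_only_start pre-locks args.nccl_port and args.port before allocating the remaining ports, so a clash with another instance on the same machine fails at startup instead of at model load, and release_port() runs before gunicorn is launched so the real server can bind args.port. A minimal sketch of that early-locking idea, assuming only the lock/release behavior visible above (the real PortLocker lives elsewhere in lightllm and may differ):

import socket

# Minimal sketch of early port locking: bind every port at startup so a
# conflict raises immediately; close the sockets once startup is committed.
class SimplePortLocker:
    def __init__(self, ports):
        self._ports = ports
        self._socks = []

    def lock_port(self):
        for port in self._ports:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.bind(("0.0.0.0", port))  # raises OSError if the port is taken
            self._socks.append(s)

    def release_port(self):
        for s in self._socks:
            s.close()
        self._socks.clear()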

lightllm/server/embed_cache/impl/naive_memory_cache.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def _check_and_set_new_id_range(self, alloced_token_num):
             except BaseException as e:
                 logger.exception(str(e))
                 time.sleep(3)
-        return self.token_id_range_start
+        return

     def _clear(self, free_max_count: int):
         deleted = 0

lightllm/server/httpserver/manager.py

Lines changed: 12 additions & 6 deletions
@@ -81,11 +81,13 @@ def __init__(
         )

         self.enable_multimodal = enable_multimodal
-        if self.enable_multimodal:
+        if self.enable_multimodal and self.args.run_mode != "llm_only":
             self.cache_client = rpyc.connect("localhost", cache_port, config={"allow_pickle": True})
-            if self.args.run_mode != "llm_only":
-                self.send_to_visual = context.socket(zmq.PUSH)
-                self.send_to_visual.connect(f"{args.zmq_mode}127.0.0.1:{visual_port}")
+            self.send_to_visual = context.socket(zmq.PUSH)
+            self.send_to_visual.connect(f"{args.zmq_mode}127.0.0.1:{visual_port}")
+
+        self.token_id_range_start = 100000000
+        self.token_id_range_end = 2 ** 63 - 1

         self.shm_req_manager = ShmReqManager()

@@ -115,6 +117,10 @@ def __init__(
         self.latest_success_infer_time_mark.set_value(int(time.time()))
         return

+    async def _check_and_set_new_id_range(self, token_num):
+        assert self.token_id_range_start + token_num < self.token_id_range_end
+        self.token_id_range_start += token_num
+
     async def _alloc_resource(self, items, md5sums, token_nums, datas):

         while True:

@@ -199,7 +205,7 @@ async def _get_image_embedding_from_afs(self, multimodal_params: MultimodalParam
             await self._wait_for_afs_embed(md5sum)
             img.uuid = uid_int
             img.afs_embed = True
-            token_id_range_start = self.cache_client.root._check_and_set_new_id_range(token_num)
+            token_id_range_start = self.token_id_range_start
             img.token_id = token_id_range_start
             img.token_num = token_num

@@ -216,7 +222,7 @@ async def _get_image_embedding_from_afs(self, multimodal_params: MultimodalParam
             uid_int = int(md5sum, 16)
             audio.uuid = uid_int
             audio.afs_embed = True
-            token_id_range_start = self.cache_client.root._check_and_set_new_id_range(token_num)
+            token_id_range_start = self.token_id_range_start
             audio.token_id = token_id_range_start
             audio.token_num = token_num
             return
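These hunks replace the embed cache's RPC-based _check_and_set_new_id_range with a counter kept locally on the HttpServerManager: ids start at 100000000 and the new async helper advances the cursor after asserting it stays below 2 ** 63 - 1. A standalone sketch of that allocator, with the read and the advance combined into one call (the class name and alloc method are illustrative, not from the commit):

# Standalone sketch of the local token-id range allocator; names mirror the
# diff, but this class is hypothetical and exists only for illustration.
class TokenIdRangeAllocator:
    def __init__(self, start: int = 100_000_000, end: int = 2 ** 63 - 1):
        self.token_id_range_start = start
        self.token_id_range_end = end

    def alloc(self, token_num: int) -> int:
        # hand out [start, start + token_num) and advance the cursor
        assert self.token_id_range_start + token_num < self.token_id_range_end
        range_start = self.token_id_range_start
        self.token_id_range_start += token_num
        return range_start

allocator = TokenIdRangeAllocator()
first = allocator.alloc(576)   # e.g. a 576-token image embedding
second = allocator.alloc(576)  # the next item starts right after the first
assert second == first + 576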

lightllm/server/httpserver_for_visual_only/manager.py

Lines changed: 1 addition & 1 deletion
@@ -238,7 +238,7 @@ async def generate(
             req_obj.init(
                 group_request_id + i,
                 # arbitrary placeholder, to be changed later
-                [24, 67],
+                [21456],
                 sampling_params,
                 self.tokenizer,
                 chunked_prefill_size=self.args.chunked_prefill_size,
