Skip to content

Commit 725f592

Browse files
committed
Merge branch 'main' into vit_fix
2 parents 7e1119f + 3641b10 commit 725f592

File tree

5 files changed

+385
-4
lines changed

5 files changed

+385
-4
lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ def __init__(self, kvargs):
5858
self.use_dynamic_prompt_cache = kvargs.get("use_dynamic_prompt_cache", False)
5959
self.data_type = kvargs.get("data_type", "float16")
6060
self.graph_max_batch_size = kvargs.get("graph_max_batch_size", 16)
61+
self.graph_max_batch_size = (
62+
self.graph_max_batch_size // 2
63+
if get_env_start_args().enable_decode_microbatch_overlap
64+
else self.graph_max_batch_size
65+
)
6166
self.graph_max_len_in_batch = kvargs.get("graph_max_len_in_batch", 8192)
6267
self.disable_cudagraph = kvargs.get("disable_cudagraph", False)
6368
self.quant_type = kvargs.get("quant_type", "none")

lightllm/server/api_start.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from .visualserver.manager import start_visual_process
1212
from lightllm.utils.log_utils import init_logger
1313
from lightllm.utils.envs_utils import set_env_start_args, set_unique_server_name, get_unique_server_name
14-
from lightllm.utils.envs_utils import get_lightllm_gunicorn_time_out_seconds
14+
from lightllm.utils.envs_utils import get_lightllm_gunicorn_time_out_seconds, get_lightllm_gunicorn_keep_alive
1515
from .detokenization.manager import start_detokenization_process
1616
from .router.manager import start_router_process
1717
from lightllm.utils.process_check import is_process_active
@@ -260,6 +260,8 @@ def normal_or_p_d_start(args):
260260
"lightllm.server.api_http:app",
261261
"--timeout",
262262
f"{get_lightllm_gunicorn_time_out_seconds()}",
263+
"--keep-alive",
264+
f"{get_lightllm_gunicorn_keep_alive()}",
263265
]
264266

265267
# Start the child process
@@ -327,6 +329,8 @@ def pd_master_start(args):
327329
"lightllm.server.api_http:app",
328330
"--timeout",
329331
f"{get_lightllm_gunicorn_time_out_seconds()}",
332+
"--keep-alive",
333+
f"{get_lightllm_gunicorn_keep_alive()}",
330334
]
331335

332336
http_server_process = subprocess.Popen(command)
@@ -367,6 +371,8 @@ def config_server_start(args):
367371
"lightllm.server.config_server.api_http:app",
368372
"--timeout",
369373
f"{get_lightllm_gunicorn_time_out_seconds()}",
374+
"--keep-alive",
375+
f"{get_lightllm_gunicorn_keep_alive()}",
370376
]
371377

372378
http_server_process = subprocess.Popen(command)

lightllm/utils/envs_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,18 @@ def enable_env_vars(args):
5454

5555
@lru_cache(maxsize=None)
5656
def get_deepep_num_max_dispatch_tokens_per_rank():
57+
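# This value must be larger than the maximum per-GPU batch size and must be a multiple of 8. It directly affects GPU memory usage: the larger the value, the more GPU memory is consumed. If you run out of GPU memory, try lowering it.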
5758
return int(os.getenv("NUM_MAX_DISPATCH_TOKENS_PER_RANK", 256))
5859

5960

6061
def get_lightllm_gunicorn_time_out_seconds():
6162
return int(os.getenv("LIGHTLMM_GUNICORN_TIME_OUT", 180))
6263

6364

65+
def get_lightllm_gunicorn_keep_alive():
66+
return int(os.getenv("LIGHTLMM_GUNICORN_KEEP_ALIVE", 10))
67+
68+
6469
@lru_cache(maxsize=None)
6570
def get_lightllm_websocket_max_message_size():
6671
"""

0 commit comments

Comments
 (0)