ModelTC
diff --git a/‎lightllm/common/basemodel/basemodel.py‎
Lines changed: 5 additions & 0 deletions b/‎lightllm/common/basemodel/basemodel.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lightllm/server/api_start.py‎
Lines changed: 7 additions & 1 deletion b/‎lightllm/server/api_start.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎lightllm/utils/envs_utils.py‎
Lines changed: 5 additions & 0 deletions b/‎lightllm/utils/envs_utils.py‎
Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,11 @@ def __init__(self, kvargs):
         self.use_dynamic_prompt_cache = kvargs.get("use_dynamic_prompt_cache", False)
         self.data_type = kvargs.get("data_type", "float16")
         self.graph_max_batch_size = kvargs.get("graph_max_batch_size", 16)
+        self.graph_max_batch_size = (
+            self.graph_max_batch_size // 2
+            if get_env_start_args().enable_decode_microbatch_overlap
+            else self.graph_max_batch_size
+        )
         self.graph_max_len_in_batch = kvargs.get("graph_max_len_in_batch", 8192)
         self.disable_cudagraph = kvargs.get("disable_cudagraph", False)
         self.quant_type = kvargs.get("quant_type", "none")
 
@@ -11,7 +11,7 @@
 from .visualserver.manager import start_visual_process
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import set_env_start_args, set_unique_server_name, get_unique_server_name
-from lightllm.utils.envs_utils import get_lightllm_gunicorn_time_out_seconds
+from lightllm.utils.envs_utils import get_lightllm_gunicorn_time_out_seconds, get_lightllm_gunicorn_keep_alive
 from .detokenization.manager import start_detokenization_process
 from .router.manager import start_router_process
 from lightllm.utils.process_check import is_process_active
@@ -260,6 +260,8 @@ def normal_or_p_d_start(args):
         "lightllm.server.api_http:app",
         "--timeout",
         f"{get_lightllm_gunicorn_time_out_seconds()}",
+        "--keep-alive",
+        f"{get_lightllm_gunicorn_keep_alive()}",
     ]
 
     # Start the child process
@@ -327,6 +329,8 @@ def pd_master_start(args):
         "lightllm.server.api_http:app",
         "--timeout",
         f"{get_lightllm_gunicorn_time_out_seconds()}",
+        "--keep-alive",
+        f"{get_lightllm_gunicorn_keep_alive()}",
     ]
 
     http_server_process = subprocess.Popen(command)
@@ -367,6 +371,8 @@ def config_server_start(args):
         "lightllm.server.config_server.api_http:app",
         "--timeout",
         f"{get_lightllm_gunicorn_time_out_seconds()}",
+        "--keep-alive",
+        f"{get_lightllm_gunicorn_keep_alive()}",
     ]
 
     http_server_process = subprocess.Popen(command)
 
@@ -54,13 +54,18 @@ def enable_env_vars(args):
 
 @lru_cache(maxsize=None)
 def get_deepep_num_max_dispatch_tokens_per_rank():
+    # This value must be larger than the maximum per-GPU batch size and must be a multiple of 8. It directly drives GPU memory usage (the larger the value, the more memory is used); if you run out of GPU memory, try lowering it.
     return int(os.getenv("NUM_MAX_DISPATCH_TOKENS_PER_RANK", 256))
 
 
 def get_lightllm_gunicorn_time_out_seconds():
+    # Reads LIGHTLMM_GUNICORN_TIME_OUT as an int, defaulting to 180; the launchers in api_start.py pass it to gunicorn's --timeout.
+    # NOTE(review): the variable name is spelled "LIGHTLMM", not "LIGHTLLM" -- keep that spelling when setting it.
     return int(os.getenv("LIGHTLMM_GUNICORN_TIME_OUT", 180))
 
 
+def get_lightllm_gunicorn_keep_alive():
+    """
+    Return the value passed to gunicorn's ``--keep-alive`` option.
+
+    Read from the LIGHTLMM_GUNICORN_KEEP_ALIVE environment variable and
+    converted with ``int``; defaults to 10 when the variable is unset.
+    NOTE(review): the variable name is spelled "LIGHTLMM", not "LIGHTLLM",
+    matching the existing LIGHTLMM_GUNICORN_TIME_OUT variable.
+    """
+    return int(os.getenv("LIGHTLMM_GUNICORN_KEEP_ALIVE", 10))
+
+
 @lru_cache(maxsize=None)
 def get_lightllm_websocket_max_message_size():
     """