Skip to content

Commit 81b9ecb

Browse files
Add shm size check (#978)
New startup argument: `--disable_shm_warning`. By default, LightLLM will now periodically check the available SHM size **every 120 seconds** and log warnings if it's below the recommended threshold (128GB). If `--disable_shm_warning` is set, LightLLM will perform a one-time SHM size check only during startup, and then disable further periodic checks. --------- Co-authored-by: wangzaijun <[email protected]> Co-authored-by: hiworldwzj <[email protected]>
1 parent df6afff commit 81b9ecb

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed

lightllm/server/api_cli.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,15 @@ def make_argument_parser() -> argparse.ArgumentParser:
186186
)
187187
parser.add_argument("--disable_log_stats", action="store_true", help="disable logging throughput stats.")
188188
parser.add_argument("--log_stats_interval", type=int, default=10, help="log stats interval in second.")
189+
parser.add_argument(
190+
"--disable_shm_warning",
191+
action="store_true",
192+
default=False,
193+
help="""Disable periodic shared memory (/dev/shm) warning logs.
194+
Our system requires sufficient available shared memory in /dev/shm,
195+
so a monitoring thread is enabled to check if the capacity is adequate.
196+
This setting allows you to turn off these warning checks.""",
197+
)
189198

190199
parser.add_argument("--router_token_ratio", type=float, default=0.0, help="token ratio to control router dispatch")
191200
parser.add_argument(

lightllm/server/api_start.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .router.manager import start_router_process
1616
from lightllm.utils.process_check import is_process_active
1717
from lightllm.utils.multinode_utils import send_and_receive_node_ip
18+
from lightllm.utils.shm_size_check import check_recommended_shm_size
1819

1920
logger = init_logger(__name__)
2021

@@ -62,6 +63,9 @@ def signal_handler(sig, frame):
6263
def normal_or_p_d_start(args):
6364
set_unique_server_name(args)
6465

66+
if not args.disable_shm_warning:
67+
check_recommended_shm_size(args)
68+
6569
if args.enable_mps:
6670
from lightllm.utils.device_utils import enable_mps
6771

lightllm/utils/config_utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import os
3+
from typing import Optional
34
from functools import lru_cache
45
from .envs_utils import get_env_start_args
56
from lightllm.utils.log_utils import init_logger
@@ -13,6 +14,25 @@ def get_config_json(model_path: str):
1314
return json_obj
1415

1516

17+
def get_hidden_size(model_path: str) -> Optional[int]:
    """Look up the model's hidden size in its ``config.json``.

    ``hidden_size`` is searched for, in order, at the top level of the config,
    under ``llm_config`` and under ``text_config`` (the nested locations are
    used by some multimodal models). The first location that defines the key
    wins.

    Args:
        model_path: Path passed to ``get_config_json`` to load the config.

    Returns:
        The hidden size when the located value is an ``int``; otherwise
        ``None`` (an error is logged in that case).
    """
    config_json = get_config_json(model_path)

    hidden_size = None
    if isinstance(config_json, dict):
        # Explicit membership checks replace the former bare ``except:`` blocks,
        # which also swallowed KeyboardInterrupt/SystemExit and hid unrelated
        # errors. Non-dict sections (e.g. ``"text_config": null``) are skipped
        # instead of raising.
        candidate_sections = (
            config_json,
            config_json.get("llm_config"),
            config_json.get("text_config"),
        )
        for section in candidate_sections:
            if isinstance(section, dict) and "hidden_size" in section:
                hidden_size = section["hidden_size"]
                break

    if isinstance(hidden_size, int):
        return hidden_size

    logger.error("cannot get hidden size from config.json, return None instead")
    return None
34+
35+
1636
def get_eos_token_ids(model_path: str):
1737
config_json = get_config_json(model_path)
1838
try:

lightllm/utils/shm_size_check.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
import ctypes
2+
import os
3+
import shutil
4+
import time
5+
import threading
6+
from lightllm.server.core.objs.req import ChunkedPrefillReq, TokenHealingReq
7+
from lightllm.server.multimodal_params import ImageItem
8+
from lightllm.server.tokenizer import get_tokenizer
9+
from lightllm.utils.config_utils import get_hidden_size
10+
from lightllm.utils.log_utils import init_logger
11+
12+
logger = init_logger(__name__)
13+
14+
15+
def check_recommended_shm_size(args):
16+
shm_size, recommended_shm_size, is_shm_sufficient = _check_shm_size(args)
17+
if not is_shm_sufficient:
18+
_start_shm_size_warning_thread(shm_size, recommended_shm_size)
19+
else:
20+
logger.info(
21+
f"SHM check: Available={shm_size:.2f} GB,"
22+
f"Recommended={recommended_shm_size:.2f} GB."
23+
f"Sufficient: {is_shm_sufficient}",
24+
)
25+
26+
27+
def _check_shm_size(args):
28+
RED = "\033[91m"
29+
ENDC = "\033[0m"
30+
shm_size = _get_system_shm_size_gb()
31+
required_size = _get_recommended_shm_size_gb(args)
32+
if shm_size < required_size:
33+
logger.warning(f"{RED}Available shm size {shm_size:.2f}G is less than required_size {required_size:.2f}G{ENDC}")
34+
return shm_size, required_size, False
35+
else: # shm_size >= required_size
36+
return shm_size, required_size, True
37+
38+
39+
def _start_shm_size_warning_thread(shm_size, required_shm_size):
40+
def _periodic_shm_warning(shm_size, required_shm_size):
41+
RED = "\033[91m"
42+
ENDC = "\033[0m"
43+
while True:
44+
logger.warning(
45+
f"{RED}Insufficient shared memory (SHM) available."
46+
f"Required: {required_shm_size:.2f}G, Available: {shm_size:.2f}G.\n"
47+
"If running in Docker, you can increase SHM size with the `--shm-size` flag, "
48+
f"like so: `docker run --shm-size=30g [your_image]`{ENDC}",
49+
)
50+
time.sleep(120) # 每 120 秒打印一次警告日志
51+
52+
shm_warning_thread = threading.Thread(
53+
target=_periodic_shm_warning,
54+
args=(
55+
shm_size,
56+
required_shm_size,
57+
),
58+
daemon=True,
59+
)
60+
shm_warning_thread.start()
61+
62+
63+
def _get_system_shm_size_gb():
64+
"""
65+
获取 /dev/shm 的总大小(以GB为单位)。
66+
"""
67+
try:
68+
shm_path = "/dev/shm"
69+
if not os.path.exists(shm_path):
70+
logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
71+
return 0
72+
73+
# shutil.disk_usage 返回 (total, used, free)
74+
total_bytes = shutil.disk_usage(shm_path).total
75+
total_gb = total_bytes / (1024 ** 3)
76+
return total_gb
77+
except Exception as e:
78+
logger.error(f"Error getting /dev/shm size: {e}")
79+
return 0
80+
81+
82+
def _get_recommended_shm_size_gb(args, max_image_resolution=(3940, 2160), dtype_size=2):
83+
"""
84+
获取所需的 /dev/shm 大小(以GB为单位)。
85+
"""
86+
tokenizer = get_tokenizer(args.model_dir, trust_remote_code=True)
87+
88+
# 估算input_token和logprob占用shm大小,由于是double和int64,所以固定占用8个字节
89+
input_token_logprob_size_bytes = args.running_max_req_size * 8 * 2 * args.max_req_total_len
90+
91+
# 估算Req所需的shm大小
92+
if args.token_healing_mode:
93+
req_class_size = ctypes.sizeof(TokenHealingReq)
94+
else:
95+
req_class_size = ctypes.sizeof(ChunkedPrefillReq)
96+
req_shm_size_bytes = req_class_size * args.running_max_req_size
97+
98+
if not args.enable_multimodal:
99+
total_recommended_shm_size_gb = (req_shm_size_bytes + input_token_logprob_size_bytes) / (1024 ** 3) + 2
100+
else:
101+
# 存储图片数据所需的shm大小
102+
num_channels = 3
103+
image_width, image_height = max_image_resolution
104+
image_size_bytes = image_width * image_height * num_channels
105+
106+
# 假设加载最大分辨率图片时,通过 tokenizer 得到最多的 image_tokens
107+
if not hasattr(tokenizer, "get_image_token_length"):
108+
logger.error("Tokenizer must have a 'get_image_token_length' method for multimodal models.")
109+
return float("inf")
110+
111+
fake_image_item = ImageItem(
112+
type="image_size",
113+
data=max_image_resolution,
114+
)
115+
fake_image_item.image_w = fake_image_item._data[0]
116+
fake_image_item.image_h = fake_image_item._data[1]
117+
max_image_tokens = tokenizer.get_image_token_length(fake_image_item)
118+
119+
# 估算图片 token 所需的资源
120+
hidden_size = get_hidden_size(args.model_dir)
121+
if hidden_size is None:
122+
logger.warning(
123+
"Model config not contain 'hidden_size', " "using 4096 by default to calculate recommended shm size."
124+
)
125+
image_token_size_bytes = max_image_tokens * 4096 * dtype_size
126+
else:
127+
image_token_size_bytes = max_image_tokens * hidden_size * dtype_size
128+
129+
total_recommended_shm_size_gb = (
130+
args.cache_capacity * (image_size_bytes + image_token_size_bytes)
131+
+ req_shm_size_bytes
132+
+ input_token_logprob_size_bytes
133+
)
134+
135+
total_recommended_shm_size_gb = total_recommended_shm_size_gb / (1024 ** 3) + 2
136+
137+
return total_recommended_shm_size_gb

0 commit comments

Comments
 (0)