Skip to content

Commit 9cf8751

Browse files
fix shm check
1 parent 4ebea76 commit 9cf8751

File tree

2 files changed

+130
-43
lines changed

2 files changed

+130
-43
lines changed

lightllm/server/api_start.py

Lines changed: 10 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -16,41 +16,11 @@
1616
from .router.manager import start_router_process
1717
from lightllm.utils.process_check import is_process_active
1818
from lightllm.utils.multinode_utils import send_and_receive_node_ip
19+
from lightllm.utils.shm_size_check import check_shm_size, start_shm_size_warning_thread
1920

2021
logger = init_logger(__name__)
2122

2223

23-
def get_shm_size_gb():
24-
"""
25-
获取 /dev/shm 的总大小(以GB为单位)。
26-
"""
27-
try:
28-
shm_path = "/dev/shm"
29-
if not os.path.exists(shm_path):
30-
logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
31-
return 0
32-
33-
# shutil.disk_usage 返回 (total, used, free)
34-
total_bytes = shutil.disk_usage(shm_path).total
35-
total_gb = total_bytes / (1024 ** 3)
36-
return total_gb
37-
except Exception as e:
38-
logger.error(f"Error getting /dev/shm size: {e}")
39-
return 0
40-
41-
42-
def check_shm_size():
43-
RED = "\033[91m"
44-
GREEN = "\033[92m"
45-
ENDC = "\033[0m"
46-
shm_size = get_shm_size_gb()
47-
required_size = 128 # 128G
48-
if shm_size < required_size:
49-
logger.warning(f"{RED}Available shm size is less than 128G: {shm_size:.2f}G{ENDC}")
50-
else:
51-
logger.info(f"{GREEN}/dev/shm available space is sufficient ({shm_size:.2f} GB >= {required_size} GB).{ENDC}")
52-
53-
5424
def setup_signal_handlers(http_server_process, process_manager):
5525
def signal_handler(sig, frame):
5626
if sig == signal.SIGINT:
@@ -94,18 +64,15 @@ def signal_handler(sig, frame):
9464
def normal_or_p_d_start(args):
9565
set_unique_server_name(args)
9666

97-
check_shm_size()
98-
99-
if not args.disable_shm_warning:
100-
import threading
101-
102-
def periodic_shm_warning():
103-
while True:
104-
check_shm_size()
105-
time.sleep(120) # 每 120 秒打印一次警告日志
106-
107-
shm_warning_thread = threading.Thread(target=periodic_shm_warning, daemon=True)
108-
shm_warning_thread.start()
67+
# One-time /dev/shm check at startup: start the periodic warning thread only when
# SHM is insufficient and warnings are not disabled; otherwise log the result once.
shm_size, require_shm_size, is_shm_sufficient = check_shm_size(args)
if not args.disable_shm_warning and not is_shm_sufficient:
    start_shm_size_warning_thread(shm_size, require_shm_size)
else:
    # The message must be ONE string. Extra positional arguments to logger.info are
    # %-format arguments, not fragments to join, so passing several strings makes
    # logging fail to format the record instead of printing it.
    logger.info(
        f"SHM check: Available={shm_size:.2f} GB, "
        f"Required={require_shm_size:.2f} GB. "
        f"Sufficient: {is_shm_sufficient}"
    )
10976

11077
if args.enable_mps:
11178
from lightllm.utils.device_utils import enable_mps

lightllm/utils/shm_size_check.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import ctypes
2+
import os
3+
import shutil
4+
import time
5+
import threading
6+
import psutil
7+
import signal
8+
from lightllm.server.core.objs.out_token_circlequeue import LIGHTLLM_OUT_TOKEN_QUEUE_SIZE, LIGHTLLM_TOKEN_MAX_BYTES
9+
from lightllm.server.core.objs.req import ChunkedPrefillReq, TokenHealingReq
10+
from lightllm.server.tokenizer import get_tokenizer
11+
from lightllm.utils.config_utils import get_config_json
12+
from lightllm.utils.log_utils import init_logger
13+
from transformers import AutoTokenizer
14+
15+
logger = init_logger(__name__)
16+
17+
18+
def get_shm_size_gb():
    """
    Return the total size of /dev/shm in GB (bytes divided by 1024 ** 3).

    Returns 0 when /dev/shm does not exist or when reading its size raises;
    both situations are reported through logger.error.
    """
    shm_path = "/dev/shm"
    bytes_per_gb = 1024 ** 3
    try:
        if os.path.exists(shm_path):
            # disk_usage yields (total, used, free); only the total is reported.
            usage = shutil.disk_usage(shm_path)
            return usage.total / bytes_per_gb
        logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
        return 0
    except Exception as e:
        logger.error(f"Error getting /dev/shm size: {e}")
        return 0
35+
36+
37+
def get_required_shm_size_gb(args, max_image_resolution=(3940, 2160), dtype_size=2):
    """
    Estimate the /dev/shm size (in GB) required by the server.

    Non-multimodal deployments use a fixed budget of 24 GB. For multimodal
    deployments the estimate is the sum of:
      * args.cache_capacity * (raw image bytes + image-token embedding bytes),
      * the shared-memory footprint of args.running_max_req_size Req objects,
      * the out-token queue,
    converted to GB, plus a constant 2 GB added on top.

    Args:
        args: Startup arguments. Reads enable_multimodal, model_dir,
            token_healing_mode, running_max_req_size and cache_capacity.
        max_image_resolution: (width, height) of the largest image assumed.
        dtype_size: Bytes per element used for the image-token embeddings.

    Returns:
        The required /dev/shm size in GB.

    Raises:
        AttributeError: In multimodal mode, when the tokenizer has no
            get_image_token_length method.
    """
    if not args.enable_multimodal:
        # Fixed default for non-multimodal serving. The model config and tokenizer
        # are intentionally not loaded here: nothing on this path uses them.
        return 24

    model_config = get_config_json(args.model_dir)
    tokenizer = get_tokenizer(args.model_dir, trust_remote_code=True)

    # Raw pixel footprint of one image; the formula counts one byte per channel value.
    # NOTE(review): 3940 in the default resolution may be a typo for 3840 (4K UHD) - confirm.
    num_channels = 3
    image_width, image_height = max_image_resolution
    image_size_bytes = image_width * image_height * num_channels

    # Assume a max-resolution image produces the largest image-token count the
    # tokenizer reports.
    if not hasattr(tokenizer, "get_image_token_length"):
        raise AttributeError("Tokenizer must have a 'get_image_token_length' method for multimodal models.")
    max_image_tokens = tokenizer.get_image_token_length(None)

    # Embedding footprint of the image tokens.
    hidden_size = model_config.get("hidden_size")
    if hidden_size is None:
        logger.warning("Model config not contain 'hidden_size', using 4096 by default.")
        hidden_size = 4096
    image_token_size_bytes = max_image_tokens * hidden_size * dtype_size

    # Shared memory taken by the Req objects.
    req_class = TokenHealingReq if args.token_healing_mode else ChunkedPrefillReq
    req_shm_size_bytes = ctypes.sizeof(req_class) * args.running_max_req_size

    # Shared memory taken by the out-token queue.
    out_token_queue_size_bytes = LIGHTLLM_TOKEN_MAX_BYTES * LIGHTLLM_OUT_TOKEN_QUEUE_SIZE

    total_required_size = (
        args.cache_capacity * (image_size_bytes + image_token_size_bytes)
        + req_shm_size_bytes
        + out_token_queue_size_bytes
    )

    # Convert to GB and add the constant 2 GB the estimate has always included.
    return total_required_size / (1024 ** 3) + 2
84+
85+
86+
def check_shm_size(args):
    """
    Compare the size of /dev/shm with the estimated requirement.

    Logs a warning when /dev/shm is smaller than required.

    Args:
        args: Startup arguments, forwarded to get_required_shm_size_gb.

    Returns:
        A tuple (shm_size, required_size, is_sufficient): both sizes in GB and
        a bool that is True when shm_size >= required_size.
    """
    RED = "\033[91m"
    ENDC = "\033[0m"
    shm_size = get_shm_size_gb()
    required_size = get_required_shm_size_gb(args)
    is_sufficient = shm_size >= required_size
    if not is_sufficient:
        # Report the computed requirement; the threshold is no longer a fixed 128G.
        logger.warning(
            f"{RED}Available shm size is less than the required {required_size:.2f}G: {shm_size:.2f}G{ENDC}"
        )
    return shm_size, required_size, is_sufficient
96+
97+
98+
def periodic_shm_warning(shm_size, required_shm_size):
    """
    Log an insufficient-SHM warning every 120 seconds, forever.

    This function never returns, so it is meant to be used as the target of a
    background thread.

    Args:
        shm_size: Size of /dev/shm in GB, as shown in the message.
        required_shm_size: Required /dev/shm size in GB, as shown in the message.
    """
    RED = "\033[91m"
    ENDC = "\033[0m"
    # Build ONE message string. logger.warning treats extra positional arguments as
    # %-format arguments, so passing the fragments separately makes logging fail to
    # format the record instead of printing it. The text never changes, so it is
    # built once outside the loop.
    message = (
        f"{RED}Insufficient shared memory (SHM) available. "
        f"Required: {required_shm_size:.2f}G, Available: {shm_size:.2f}G.\n"
        "If running in Docker, you can increase SHM size with the `--shm-size` flag, "
        f"like so: `docker run --shm-size=30g [your_image]`{ENDC}"
    )
    while True:
        logger.warning(message)
        time.sleep(120)  # repeat the warning every 120 seconds
109+
110+
111+
def start_shm_size_warning_thread(shm_size, required_shm_size):
    """
    Start a daemon thread that runs periodic_shm_warning.

    Args:
        shm_size: Passed to periodic_shm_warning as its first argument.
        required_shm_size: Passed to periodic_shm_warning as its second argument.
    """
    warning_args = (shm_size, required_shm_size)
    # daemon=True is passed to the Thread constructor, exactly as before.
    reporter = threading.Thread(target=periodic_shm_warning, args=warning_args, daemon=True)
    reporter.start()

0 commit comments

Comments
 (0)