Add shm size check (#978)

flyinglandlord · hiworldwzj · web-flow · commit 81b9ecb791cc · 2025-08-18T20:14:07.000+08:00
New startup argument: `--disable_shm_warning`.

By default, LightLLM will now periodically check the available SHM size
**every 120 seconds** and log warnings if it's below the recommended
threshold (128GB).

If `--disable_shm_warning` is set, LightLLM will perform a one-time SHM
size check only during startup, and then disable further periodic
checks.

---------

Co-authored-by: wangzaijun <wzjhelloworld@qq.com>
Co-authored-by: hiworldwzj <30762946+hiworldwzj@users.noreply.github.com>
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
@@ -186,6 +186,15 @@ def make_argument_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--disable_log_stats", action="store_true", help="disable logging throughput stats.")
     parser.add_argument("--log_stats_interval", type=int, default=10, help="log stats interval in second.")
+    parser.add_argument(
+        "--disable_shm_warning",
+        action="store_true",
+        default=False,
+        help="""Disable periodic shared memory (/dev/shm) warning logs.
+        Our system requires sufficient available shared memory in /dev/shm,
+        so a monitoring thread is enabled to check if the capacity is adequate.
+        This setting allows you to turn off these warning checks.""",
+    )
 
     parser.add_argument("--router_token_ratio", type=float, default=0.0, help="token ratio to control router dispatch")
     parser.add_argument(
diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py
@@ -15,6 +15,7 @@
 from .router.manager import start_router_process
 from lightllm.utils.process_check import is_process_active
 from lightllm.utils.multinode_utils import send_and_receive_node_ip
+from lightllm.utils.shm_size_check import check_recommended_shm_size
 
 logger = init_logger(__name__)
 
@@ -62,6 +63,9 @@ def signal_handler(sig, frame):
 def normal_or_p_d_start(args):
     set_unique_server_name(args)
 
+    if not args.disable_shm_warning:
+        check_recommended_shm_size(args)
+
     if args.enable_mps:
         from lightllm.utils.device_utils import enable_mps
 
diff --git a/lightllm/utils/config_utils.py b/lightllm/utils/config_utils.py
@@ -1,5 +1,6 @@
 import json
 import os
+from typing import Optional
 from functools import lru_cache
 from .envs_utils import get_env_start_args
 from lightllm.utils.log_utils import init_logger
@@ -13,6 +14,25 @@ def get_config_json(model_path: str):
     return json_obj
 
 
+def get_hidden_size(model_path: str) -> Optional[int]:
+    """Look up the model's hidden size in its ``config.json``.
+
+    The top-level ``hidden_size`` entry is tried first, then the
+    ``llm_config`` and ``text_config`` sub-sections (used by some multimodal
+    models), in that order.
+
+    Args:
+        model_path: model directory handed to ``get_config_json``.
+
+    Returns:
+        The first integer ``hidden_size`` found, or ``None`` (after logging
+        an error) when no section provides one.
+    """
+    config_json = get_config_json(model_path)
+
+    # Explicit type checks replace the former bare ``except:`` clauses, which
+    # also swallowed unrelated errors such as KeyboardInterrupt, and they keep
+    # a non-dict ``text_config`` from raising AttributeError.
+    sections = []
+    if isinstance(config_json, dict):
+        sections.append(config_json)
+        sections.extend(config_json.get(key) for key in ("llm_config", "text_config"))
+
+    for section in sections:
+        if not isinstance(section, dict):
+            continue
+        hidden_size = section.get("hidden_size")
+        if isinstance(hidden_size, int):
+            return hidden_size
+
+    logger.error("cannot get hidden size from config.json, return None instead")
+    return None
+
+
 def get_eos_token_ids(model_path: str):
     config_json = get_config_json(model_path)
     try:
diff --git a/lightllm/utils/shm_size_check.py b/lightllm/utils/shm_size_check.py
@@ -0,0 +1,137 @@
+import ctypes
+import os
+import shutil
+import time
+import threading
+from lightllm.server.core.objs.req import ChunkedPrefillReq, TokenHealingReq
+from lightllm.server.multimodal_params import ImageItem
+from lightllm.server.tokenizer import get_tokenizer
+from lightllm.utils.config_utils import get_hidden_size
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+
+def check_recommended_shm_size(args):
+    """Compare the /dev/shm size with the recommended size and report the result.
+
+    When the size is insufficient, a background thread is started that keeps
+    repeating a warning; otherwise a single info line is logged.
+
+    Args:
+        args: start arguments, forwarded unchanged to ``_check_shm_size``.
+    """
+    shm_size, recommended_shm_size, is_shm_sufficient = _check_shm_size(args)
+    if not is_shm_sufficient:
+        _start_shm_size_warning_thread(shm_size, recommended_shm_size)
+        return
+
+    # Adjacent f-strings are concatenated verbatim, so the separating spaces
+    # have to be written out explicitly.
+    logger.info(
+        f"SHM check: Available={shm_size:.2f} GB, "
+        f"Recommended={recommended_shm_size:.2f} GB. "
+        f"Sufficient: {is_shm_sufficient}",
+    )
+
+
+def _check_shm_size(args):
+    """Measure /dev/shm and compare it with the size recommended for ``args``.
+
+    A red warning is logged when the measured size is below the recommendation.
+
+    Returns:
+        A ``(shm_size, required_size, is_sufficient)`` tuple; both sizes are
+        the GB values produced by the two helper functions.
+    """
+    available_gb = _get_system_shm_size_gb()
+    needed_gb = _get_recommended_shm_size_gb(args)
+
+    is_sufficient = not (available_gb < needed_gb)
+    if not is_sufficient:
+        # ANSI escape codes: switch to bright red, then reset.
+        red, reset = "\033[91m", "\033[0m"
+        logger.warning(f"{red}Available shm size {available_gb:.2f}G is less than required_size {needed_gb:.2f}G{reset}")
+
+    return available_gb, needed_gb, is_sufficient
+
+
+def _start_shm_size_warning_thread(shm_size, required_shm_size, interval_seconds=120):
+    """Start a daemon thread that keeps warning about insufficient shared memory.
+
+    The thread logs the warning immediately and then again after every
+    ``interval_seconds``. The two sizes are the values captured by the caller;
+    they are not measured again inside the loop.
+
+    Args:
+        shm_size: measured /dev/shm size in GB, as reported in the message.
+        required_shm_size: recommended /dev/shm size in GB.
+        interval_seconds: pause between two warnings; defaults to 120.
+    """
+    RED = "\033[91m"
+    ENDC = "\033[0m"
+
+    def _periodic_shm_warning():
+        while True:
+            # The trailing space after "available." keeps the two sentences
+            # from running together once the f-strings are concatenated.
+            logger.warning(
+                f"{RED}Insufficient shared memory (SHM) available. "
+                f"Required: {required_shm_size:.2f}G, Available: {shm_size:.2f}G.\n"
+                "If running in Docker, you can increase SHM size with the `--shm-size` flag, "
+                f"like so: `docker run --shm-size=30g [your_image]`{ENDC}",
+            )
+            time.sleep(interval_seconds)
+
+    # daemon=True: the thread must never keep the server process alive.
+    shm_warning_thread = threading.Thread(target=_periodic_shm_warning, daemon=True)
+    shm_warning_thread.start()
+
+
+def _get_system_shm_size_gb():
+    """
+    Return the total size of /dev/shm in GB.
+
+    Returns 0 (after logging an error) when /dev/shm does not exist or its
+    size cannot be read.
+    """
+    shm_path = "/dev/shm"
+    try:
+        if os.path.exists(shm_path):
+            # shutil.disk_usage returns (total, used, free); only total is used.
+            usage = shutil.disk_usage(shm_path)
+            return usage.total / (1024 ** 3)
+
+        logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
+        return 0
+    except Exception as e:
+        logger.error(f"Error getting /dev/shm size: {e}")
+        return 0
+
+
+def _get_recommended_shm_size_gb(args, max_image_resolution=(3940, 2160), dtype_size=2):
+    """
+    Estimate the /dev/shm size (in GB) recommended for the given start arguments.
+
+    The estimate sums the per-request buffers and, when multimodal serving is
+    enabled, the image cache; the byte total is converted to GB and 2 is added.
+
+    Args:
+        args: start arguments; reads ``running_max_req_size``,
+            ``max_req_total_len``, ``token_healing_mode``, ``enable_multimodal``
+            and, on the multimodal path only, ``model_dir`` and ``cache_capacity``.
+        max_image_resolution: ``(width, height)`` of the largest image assumed
+            when sizing the image cache.
+        dtype_size: bytes per element used when sizing image tokens
+            (``tokens * hidden_size * dtype_size``).
+
+    Returns:
+        The recommended size in GB, or ``float("inf")`` when multimodal is
+        enabled but the tokenizer has no ``get_image_token_length`` method.
+    """
+    bytes_per_gb = 1024 ** 3
+
+    # Estimate the shm used by input tokens and logprobs; they are stored as
+    # int64 and double, so each entry takes a fixed 8 bytes.
+    input_token_logprob_size_bytes = args.running_max_req_size * 8 * 2 * args.max_req_total_len
+
+    # Estimate the shm used by the request objects themselves.
+    req_class = TokenHealingReq if args.token_healing_mode else ChunkedPrefillReq
+    req_shm_size_bytes = ctypes.sizeof(req_class) * args.running_max_req_size
+
+    if not args.enable_multimodal:
+        return (req_shm_size_bytes + input_token_logprob_size_bytes) / bytes_per_gb + 2
+
+    # The tokenizer is only needed for the image-token estimate, so it is
+    # loaded here instead of on every call (text-only startup skips it).
+    tokenizer = get_tokenizer(args.model_dir, trust_remote_code=True)
+
+    # Shm needed to hold the raw image data (one byte per channel value).
+    num_channels = 3
+    image_width, image_height = max_image_resolution
+    image_size_bytes = image_width * image_height * num_channels
+
+    # Assume the maximum-resolution image yields the largest number of image
+    # tokens the tokenizer can produce.
+    if not hasattr(tokenizer, "get_image_token_length"):
+        logger.error("Tokenizer must have a 'get_image_token_length' method for multimodal models.")
+        return float("inf")
+
+    fake_image_item = ImageItem(
+        type="image_size",
+        data=max_image_resolution,
+    )
+    fake_image_item.image_w = fake_image_item._data[0]
+    fake_image_item.image_h = fake_image_item._data[1]
+    max_image_tokens = tokenizer.get_image_token_length(fake_image_item)
+
+    # Estimate the shm needed for the image tokens.
+    hidden_size = get_hidden_size(args.model_dir)
+    if hidden_size is None:
+        logger.warning(
+            "Model config not contain 'hidden_size', " "using 4096 by default to calculate recommended shm size."
+        )
+        hidden_size = 4096
+    image_token_size_bytes = max_image_tokens * hidden_size * dtype_size
+
+    total_recommended_shm_size_bytes = (
+        args.cache_capacity * (image_size_bytes + image_token_size_bytes)
+        + req_shm_size_bytes
+        + input_token_logprob_size_bytes
+    )
+    return total_recommended_shm_size_bytes / bytes_per_gb + 2