File tree Expand file tree Collapse file tree 3 files changed +26
-2
lines changed Expand file tree Collapse file tree 3 files changed +26
-2
lines changed Original file line number Diff line number Diff line change 2121from vllm .logger import init_logger
2222from vllm .sampling_params import SamplingParams
2323from vllm .usage .usage_lib import UsageContext
24- from vllm .utils import FlexibleArgumentParser , random_uuid
24+ from vllm .utils import FlexibleArgumentParser , random_uuid , set_ulimit
2525from vllm .version import __version__ as VLLM_VERSION
2626
2727logger = init_logger ("vllm.entrypoints.api_server" )
@@ -119,6 +119,8 @@ async def run_server(args: Namespace,
119119 logger .info ("vLLM API server version %s" , VLLM_VERSION )
120120 logger .info ("args: %s" , args )
121121
122+ set_ulimit ()
123+
122124 app = await init_app (args , llm_engine )
123125 assert engine is not None
124126
Original file line number Diff line number Diff line change 6868from vllm .logger import init_logger
6969from vllm .usage .usage_lib import UsageContext
7070from vllm .utils import (FlexibleArgumentParser , get_open_zmq_ipc_path ,
71- is_valid_ipv6_address )
71+ is_valid_ipv6_address , set_ulimit )
7272from vllm .version import __version__ as VLLM_VERSION
7373
7474TIMEOUT_KEEP_ALIVE = 5 # seconds
@@ -727,6 +727,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
727727 sock_addr = (args .host or "" , args .port )
728728 sock = create_server_socket (sock_addr )
729729
730+ # workaround to avoid footguns where uvicorn drops requests with too
731+ # many concurrent requests active
732+ set_ulimit ()
733+
730734 def signal_handler (* _ ) -> None :
731735 # Interrupt server on sigterm while initializing
732736 raise KeyboardInterrupt ("terminated" )
Original file line number Diff line number Diff line change 1212import ipaddress
1313import os
1414import re
15+ import resource
1516import signal
1617import socket
1718import subprocess
@@ -1818,3 +1819,20 @@ def memory_profiling(
18181819 result .non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff .torch_memory_in_bytes # noqa
18191820 result .profile_time = diff .timestamp
18201821 result .non_kv_cache_memory_in_bytes = result .non_torch_increase_in_bytes + result .torch_peak_increase_in_bytes + result .weights_memory_in_bytes # noqa
1822+
1823+
1824+ # Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501
def set_ulimit(target_soft_limit=65535):
    """Best-effort raise of the soft limit on open file descriptors.

    Reads the current ``RLIMIT_NOFILE`` soft/hard limits and, when the soft
    limit is below ``target_soft_limit``, tries to raise the soft limit to
    that target while leaving the hard limit untouched. A failure is logged
    as a warning and never propagated, so callers can invoke this
    unconditionally during start-up.

    Args:
        target_soft_limit: Desired minimum soft limit for the number of
            open file descriptors.
    """
    resource_type = resource.RLIMIT_NOFILE
    current_soft, current_hard = resource.getrlimit(resource_type)

    # An unlimited soft limit already satisfies any target. RLIM_INFINITY
    # can be a negative sentinel on some platforms, so it must be checked
    # explicitly: a plain "<" comparison would treat it as too low and
    # end up *lowering* the limit.
    if current_soft == resource.RLIM_INFINITY:
        return

    if current_soft < target_soft_limit:
        try:
            resource.setrlimit(resource_type,
                               (target_soft_limit, current_hard))
        except (ValueError, OSError) as e:
            # ValueError: the target exceeds the hard limit.
            # OSError: the underlying setrlimit system call failed.
            logger.warning(
                "Found ulimit of %s and failed to automatically increase "
                "with error %s. This can cause fd limit errors like "
                "`OSError: [Errno 24] Too many open files`. Consider "
                "increasing with ulimit -n", current_soft, e)
You can’t perform that action at this time.
0 commit comments