Commit 3f36c32

[Benchmark] Support ready check timeout in vllm bench serve (#21696)

yeqcharlotte and Roger Wang authored.

Signed-off-by: Ye (Charlotte) Qi <[email protected]>
Co-authored-by: Roger Wang <[email protected]>

1 parent 3dddbf1 commit 3f36c32

File tree

7 files changed: +94 −11 lines changed

vllm/benchmarks/latency.py

Lines changed: 2 additions & 2 deletions

@@ -14,8 +14,8 @@
 
 import vllm.envs as envs
 from vllm import LLM, SamplingParams
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams

vllm/benchmarks/lib/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark library utilities."""

vllm/benchmarks/lib/ready_checker.py

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Utilities for checking endpoint readiness."""
+
+import asyncio
+import time
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+from .endpoint_request_func import RequestFuncInput, RequestFuncOutput
+
+
+async def wait_for_endpoint(
+    request_func,
+    test_input: RequestFuncInput,
+    timeout_seconds: int = 600,
+    retry_interval: int = 5,
+) -> RequestFuncOutput:
+    """
+    Wait for an endpoint to become available before starting benchmarks.
+
+    Args:
+        request_func: The async request function to call
+        test_input: The RequestFuncInput to test with
+        timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
+        retry_interval: Time between retries in seconds (default: 5 seconds)
+
+    Returns:
+        RequestFuncOutput: The successful response
+
+    Raises:
+        ValueError: If the endpoint doesn't become available within the timeout
+    """
+    deadline = time.perf_counter() + timeout_seconds
+    output = RequestFuncOutput(success=False)
+    print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
+
+    with tqdm(
+            total=timeout_seconds,
+            bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
+            unit="s",
+    ) as pbar:
+
+        while True:
+            # update progress bar
+            remaining = deadline - time.perf_counter()
+            elapsed = timeout_seconds - remaining
+            update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n)
+            pbar.update(update_amount)
+            pbar.refresh()
+            if remaining <= 0:
+                pbar.close()
+                break
+
+            # ping the endpoint using request_func
+            try:
+                output = await request_func(request_func_input=test_input)
+                if output.success:
+                    pbar.close()
+                    return output
+            except aiohttp.ClientConnectorError:
+                pass
+
+            # retry after a delay
+            sleep_duration = min(retry_interval, remaining)
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+    return output
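
The new helper is a poll-until-deadline loop around the existing async request functions. As a usage illustration, here is a minimal sketch of driving wait_for_endpoint directly, mirroring what serve.py does in this commit. The backend key "openai" and the RequestFuncInput field names (model, prompt, api_url, prompt_len, output_len) are assumptions for illustration, not a confirmed signature:

    import asyncio

    from vllm.benchmarks.lib.endpoint_request_func import (
        ASYNC_REQUEST_FUNCS, RequestFuncInput)
    from vllm.benchmarks.lib.ready_checker import wait_for_endpoint


    async def main():
        # Pick one of the registered request functions; the "openai"
        # key here is an assumed example entry.
        request_func = ASYNC_REQUEST_FUNCS["openai"]

        # A tiny probe request; field names are illustrative guesses.
        test_input = RequestFuncInput(
            model="my-model",
            prompt="ping",
            api_url="http://localhost:8000/v1/completions",
            prompt_len=1,
            output_len=1,
        )

        # Wait up to 5 minutes for the server, probing every 5 seconds
        # (the default retry_interval), before giving up.
        output = await wait_for_endpoint(request_func, test_input,
                                         timeout_seconds=300)
        if not output.success:
            raise ValueError("Endpoint did not become ready in time")


    asyncio.run(main())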
File renamed without changes.

vllm/benchmarks/serve.py

Lines changed: 17 additions & 7 deletions

@@ -34,12 +34,12 @@
 
 from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
                                       get_samples)
-from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
-                                                    OPENAI_COMPATIBLE_BACKENDS,
-                                                    RequestFuncInput,
-                                                    RequestFuncOutput)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.endpoint_request_func import (
+    ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+    RequestFuncOutput)
+from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -331,6 +331,7 @@ async def benchmark(
     ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
+    ready_check_timeout_sec: int = 600,
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -359,7 +360,8 @@
         extra_body=extra_body,
     )
 
-    test_output = await request_func(request_func_input=test_input)
+    test_output = await wait_for_endpoint(
+        request_func, test_input, timeout_seconds=ready_check_timeout_sec)
     if not test_output.success:
         raise ValueError(
             "Initial test run failed - Please make sure benchmark arguments "
@@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ending request rate for ramp-up (RPS). "
         "Needs to be specified when --ramp-up-strategy is used.",
     )
+    parser.add_argument(
+        "--ready-check-timeout-sec",
+        type=int,
+        default=600,
+        help="Maximum time to wait for the endpoint to become ready "
+        "in seconds (default: 600 seconds / 10 minutes).",
+    )
 
 
 def main(args: argparse.Namespace):
@@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace):
             ramp_up_strategy=args.ramp_up_strategy,
             ramp_up_start_rps=args.ramp_up_start_rps,
             ramp_up_end_rps=args.ramp_up_end_rps,
+            ready_check_timeout_sec=args.ready_check_timeout_sec,
         ))
 
     # Save config and results to json
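
With the flag wired through, the readiness wait is controlled from the command line. For example (the --model flag and value are illustrative; only --ready-check-timeout-sec is introduced by this commit):

    vllm bench serve --model meta-llama/Llama-3.1-8B-Instruct --ready-check-timeout-sec 300

This waits up to five minutes for the endpoint to answer the initial test request, retrying every few seconds, instead of failing immediately while the server is still loading.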

vllm/benchmarks/throughput.py

Lines changed: 2 additions & 2 deletions

@@ -21,8 +21,8 @@
                                       InstructCoderDataset, RandomDataset,
                                       SampleRequest, ShareGPTDataset,
                                       SonnetDataset, VisionArenaDataset)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
