Commit 3f36c32

[Benchmark] Support ready check timeout in vllm bench serve (#21696)

yeqcharlotte and Roger Wang authored.

Signed-off-by: Ye (Charlotte) Qi <[email protected]>
Co-authored-by: Roger Wang <[email protected]>

1 parent 3dddbf1 commit 3f36c32

File tree

7 files changed: +94 −11 lines changed

vllm/benchmarks/latency.py

Lines changed: 2 additions & 2 deletions

@@ -14,8 +14,8 @@
 
 import vllm.envs as envs
 from vllm import LLM, SamplingParams
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams

vllm/benchmarks/lib/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark library utilities."""

vllm/benchmarks/lib/ready_checker.py

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Utilities for checking endpoint readiness."""
+
+import asyncio
+import time
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+from .endpoint_request_func import RequestFuncInput, RequestFuncOutput
+
+
+async def wait_for_endpoint(
+    request_func,
+    test_input: RequestFuncInput,
+    timeout_seconds: int = 600,
+    retry_interval: int = 5,
+) -> RequestFuncOutput:
+    """
+    Wait for an endpoint to become available before starting benchmarks.
+
+    Args:
+        request_func: The async request function to call
+        test_input: The RequestFuncInput to test with
+        timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
+        retry_interval: Time between retries in seconds (default: 5 seconds)
+
+    Returns:
+        RequestFuncOutput: The successful response
+
+    Raises:
+        ValueError: If the endpoint doesn't become available within the timeout
+    """
+    deadline = time.perf_counter() + timeout_seconds
+    output = RequestFuncOutput(success=False)
+    print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
+
+    with tqdm(
+            total=timeout_seconds,
+            bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
+            unit="s",
+    ) as pbar:
+
+        while True:
+            # update progress bar
+            remaining = deadline - time.perf_counter()
+            elapsed = timeout_seconds - remaining
+            update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n)
+            pbar.update(update_amount)
+            pbar.refresh()
+            if remaining <= 0:
+                pbar.close()
+                break
+
+            # ping the endpoint using request_func
+            try:
+                output = await request_func(request_func_input=test_input)
+                if output.success:
+                    pbar.close()
+                    return output
+            except aiohttp.ClientConnectorError:
+                pass
+
+            # retry after a delay
+            sleep_duration = min(retry_interval, remaining)
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+    return output
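
The new helper is a poll-until-deadline loop around the existing async request functions. As a usage illustration, here is a minimal sketch of driving wait_for_endpoint directly, mirroring what serve.py does in this commit. The backend key "openai" and the RequestFuncInput field names (model, prompt, api_url, prompt_len, output_len) are assumptions for illustration, not a confirmed signature:

    import asyncio

    from vllm.benchmarks.lib.endpoint_request_func import (
        ASYNC_REQUEST_FUNCS, RequestFuncInput)
    from vllm.benchmarks.lib.ready_checker import wait_for_endpoint


    async def main():
        # Pick one of the registered request functions; the "openai"
        # key here is an assumed example entry.
        request_func = ASYNC_REQUEST_FUNCS["openai"]

        # A tiny probe request; field names are illustrative guesses.
        test_input = RequestFuncInput(
            model="my-model",
            prompt="ping",
            api_url="http://localhost:8000/v1/completions",
            prompt_len=1,
            output_len=1,
        )

        # Wait up to 5 minutes for the server, probing every 5 seconds
        # (the default retry_interval), before giving up.
        output = await wait_for_endpoint(request_func, test_input,
                                         timeout_seconds=300)
        if not output.success:
            raise ValueError("Endpoint did not become ready in time")


    asyncio.run(main())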
File renamed without changes.

vllm/benchmarks/serve.py

Lines changed: 17 additions & 7 deletions

@@ -34,12 +34,12 @@
 
 from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
                                       get_samples)
-from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
-                                                    OPENAI_COMPATIBLE_BACKENDS,
-                                                    RequestFuncInput,
-                                                    RequestFuncOutput)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.endpoint_request_func import (
+    ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+    RequestFuncOutput)
+from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -331,6 +331,7 @@ async def benchmark(
     ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
+    ready_check_timeout_sec: int = 600,
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -359,7 +360,8 @@
         extra_body=extra_body,
     )
 
-    test_output = await request_func(request_func_input=test_input)
+    test_output = await wait_for_endpoint(
+        request_func, test_input, timeout_seconds=ready_check_timeout_sec)
     if not test_output.success:
         raise ValueError(
             "Initial test run failed - Please make sure benchmark arguments "
@@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ending request rate for ramp-up (RPS). "
         "Needs to be specified when --ramp-up-strategy is used.",
     )
+    parser.add_argument(
+        "--ready-check-timeout-sec",
+        type=int,
+        default=600,
+        help="Maximum time to wait for the endpoint to become ready "
+        "in seconds (default: 600 seconds / 10 minutes).",
+    )
 
 
 def main(args: argparse.Namespace):
@@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace):
             ramp_up_strategy=args.ramp_up_strategy,
             ramp_up_start_rps=args.ramp_up_start_rps,
             ramp_up_end_rps=args.ramp_up_end_rps,
+            ready_check_timeout_sec=args.ready_check_timeout_sec,
         ))
 
     # Save config and results to json
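
With the flag wired through, the readiness wait is controlled from the command line. For example (the --model flag and value are illustrative; only --ready-check-timeout-sec is introduced by this commit):

    vllm bench serve --model meta-llama/Llama-3.1-8B-Instruct --ready-check-timeout-sec 300

This waits up to five minutes for the endpoint to answer the initial test request, retrying every few seconds, instead of failing immediately while the server is still loading.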

vllm/benchmarks/throughput.py

Lines changed: 2 additions & 2 deletions

@@ -21,8 +21,8 @@
                                       InstructCoderDataset, RandomDataset,
                                       SampleRequest, ShareGPTDataset,
                                       SonnetDataset, VisionArenaDataset)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
