feat(router): make timeout customizable

max-wittig · max-wittig · commit 7ce50b090615 · 2025-06-25T16:28:30.000+02:00
A 30-second timeout per model is enough for most people,
but somebody might use a very large model or run on slow machines,
so the timeout should be configurable to support that use case as well.

Customizing the health check interval will be done in
another MR.

Signed-off-by: Max Wittig <max.wittig@siemens.com>
diff --git a/src/tests/test_static_service_discovery.py b/src/tests/test_static_service_discovery.py
@@ -21,6 +21,7 @@ def test_init_when_static_backend_health_checks_calls_start_health_checks(
         None,
         None,
         static_backend_health_checks=True,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -43,6 +44,7 @@ def test_init_when_endpoint_health_check_disabled_does_not_call_start_health_che
         None,
         None,
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -61,6 +63,7 @@ def test_get_unhealthy_endpoint_hashes_when_only_healthy_models_exist_does_not_r
         None,
         ["chat"],
         static_backend_health_checks=True,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -79,6 +82,7 @@ def test_get_unhealthy_endpoint_hashes_when_unhealthy_model_exist_returns_unheal
         None,
         ["chat"],
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -92,7 +96,7 @@ def test_get_unhealthy_endpoint_hashes_when_healthy_and_unhealthy_models_exist_r
 ) -> None:
     unhealthy_model = "bge-m3"
 
-    def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
+    def mock_is_model_healthy(url: str, model: str, model_type: str, timeout: int = 30) -> bool:
         return model != unhealthy_model
 
     monkeypatch.setattr("vllm_router.utils.is_model_healthy", mock_is_model_healthy)
@@ -104,6 +108,7 @@ def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
         None,
         ["chat", "embeddings"],
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -128,6 +133,7 @@ def mock_get_model_endpoint_hash(url: str, model: str) -> str:
         None,
         ["chat", "chat"],
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py
@@ -149,6 +149,7 @@ def initialize_all(app: FastAPI, args):
                 else None
             ),
             static_backend_health_checks=args.static_backend_health_checks,
+            static_backend_health_checks_timeout=args.static_backend_health_checks_timeout,
             prefill_model_labels=args.prefill_model_labels,
             decode_model_labels=args.decode_model_labels,
         )
diff --git a/src/vllm_router/parsers/parser.py b/src/vllm_router/parsers/parser.py
@@ -156,6 +156,12 @@ def parse_args():
         action="store_true",
         help="Enable this flag to make vllm-router check periodically if the models work by sending dummy requests to their endpoints.",
     )
+    parser.add_argument(
+        "--static-backend-health-checks-timeout",
+        type=int,
+        help="Timeout in seconds for dummy requests sent using the static backend health check. Defaults to 30.",
+        default=30,
+    )
     parser.add_argument(
         "--k8s-port",
         type=int,
diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -210,6 +210,7 @@ def __init__(
         model_labels: List[str] | None,
         model_types: List[str] | None,
         static_backend_health_checks: bool,
+        static_backend_health_checks_timeout: int,
         prefill_model_labels: List[str] | None,
         decode_model_labels: List[str] | None,
     ):
@@ -223,6 +224,7 @@ def __init__(
         self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))]
         self.added_timestamp = int(time.time())
         self.unhealthy_endpoint_hashes = []
+        self.static_backend_health_checks_timeout = static_backend_health_checks_timeout
         if static_backend_health_checks:
             self.start_health_check_task()
         self.prefill_model_labels = prefill_model_labels
@@ -231,7 +233,7 @@ def __init__(
     def get_unhealthy_endpoint_hashes(self) -> list[str]:
         unhealthy_endpoints = []
         for url, model, model_type in zip(self.urls, self.models, self.model_types):
-            if utils.is_model_healthy(url, model, model_type):
+            if utils.is_model_healthy(url, model, model_type, self.static_backend_health_checks_timeout):
                 logger.debug(f"{model} at {url} is healthy")
             else:
                 logger.warning(f"{model} at {url} not healthy!")
diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
@@ -157,14 +157,14 @@ def update_content_length(request: Request, request_body: str):
     request._headers = headers
 
 
-def is_model_healthy(url: str, model: str, model_type: str) -> bool:
+def is_model_healthy(url: str, model: str, model_type: str, timeout: int = 30) -> bool:
     model_details = ModelType[model_type]
     try:
         response = requests.post(
             f"{url}{model_details.value}",
             headers={"Content-Type": "application/json"},
             json={"model": model} | model_details.get_test_payload(model_type),
-            timeout=30,
+            timeout=timeout,
         )
     except Exception as e:
         logger.error(e)

Original file line number	Diff line number	Diff line change
`@@ -149,6 +149,7 @@ def initialize_all(app: FastAPI, args):`
`149`	`149`	`else None`
`150`	`150`	`),`
`151`	`151`	`static_backend_health_checks=args.static_backend_health_checks,`
	`152`	`+ static_backend_health_checks_timeout=args.static_backend_health_checks_timeout,`
`152`	`153`	`prefill_model_labels=args.prefill_model_labels,`
`153`	`154`	`decode_model_labels=args.decode_model_labels,`
`154`	`155`	`)`