vllm-project · max-wittig · Jun 25, 2025
diff --git a/src/tests/test_static_service_discovery.py b/src/tests/test_static_service_discovery.py
@@ -21,6 +21,7 @@ def test_init_when_static_backend_health_checks_calls_start_health_checks(
         None,
         None,
         static_backend_health_checks=True,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -43,6 +44,7 @@ def test_init_when_endpoint_health_check_disabled_does_not_call_start_health_che
         None,
         None,
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -61,6 +63,7 @@ def test_get_unhealthy_endpoint_hashes_when_only_healthy_models_exist_does_not_r
         None,
         ["chat"],
         static_backend_health_checks=True,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -79,6 +82,7 @@ def test_get_unhealthy_endpoint_hashes_when_unhealthy_model_exist_returns_unheal
         None,
         ["chat"],
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -92,7 +96,9 @@ def test_get_unhealthy_endpoint_hashes_when_healthy_and_unhealthy_models_exist_r
 ) -> None:
     unhealthy_model = "bge-m3"
 
-    def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
+    def mock_is_model_healthy(
+        url: str, model: str, model_type: str, timeout: int = 30
+    ) -> bool:
         return model != unhealthy_model
 
     monkeypatch.setattr("vllm_router.utils.is_model_healthy", mock_is_model_healthy)
@@ -104,6 +110,7 @@ def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
         None,
         ["chat", "embeddings"],
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )
@@ -128,6 +135,7 @@ def mock_get_model_endpoint_hash(url: str, model: str) -> str:
         None,
         ["chat", "chat"],
         static_backend_health_checks=False,
+        static_backend_health_checks_timeout=30,
         prefill_model_labels=None,
         decode_model_labels=None,
     )

diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py
@@ -149,6 +149,7 @@ def initialize_all(app: FastAPI, args):
                 else None
             ),
             static_backend_health_checks=args.static_backend_health_checks,
+            static_backend_health_checks_timeout=args.static_backend_health_checks_timeout,
             prefill_model_labels=args.prefill_model_labels,
             decode_model_labels=args.decode_model_labels,
         )

diff --git a/src/vllm_router/parsers/parser.py b/src/vllm_router/parsers/parser.py
@@ -156,6 +156,12 @@ def parse_args():
         action="store_true",
         help="Enable this flag to make vllm-router check periodically if the models work by sending dummy requests to their endpoints.",
     )
+    parser.add_argument(
+        "--static-backend-health-checks-timeout",
+        type=int,
+        help="Timeout in seconds for dummy requests sent using the static backend health check. Defaults to 30.",
+        default=30,
+    )
     parser.add_argument(
         "--k8s-port",
         type=int,

diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -210,6 +210,7 @@ def __init__(
         model_labels: List[str] | None,
         model_types: List[str] | None,
         static_backend_health_checks: bool,
+        static_backend_health_checks_timeout: int,
         prefill_model_labels: List[str] | None,
         decode_model_labels: List[str] | None,
     ):
@@ -223,6 +224,7 @@ def __init__(
         self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))]
         self.added_timestamp = int(time.time())
         self.unhealthy_endpoint_hashes = []
+        self.static_backend_health_checks_timeout = static_backend_health_checks_timeout
         if static_backend_health_checks:
             self.start_health_check_task()
         self.prefill_model_labels = prefill_model_labels
@@ -231,7 +233,9 @@ def __init__(
     def get_unhealthy_endpoint_hashes(self) -> list[str]:
         unhealthy_endpoints = []
         for url, model, model_type in zip(self.urls, self.models, self.model_types):
-            if utils.is_model_healthy(url, model, model_type):
+            if utils.is_model_healthy(
+                url, model, model_type, self.static_backend_health_checks_timeout
+            ):
                 logger.debug(f"{model} at {url} is healthy")
             else:
                 logger.warning(f"{model} at {url} not healthy!")

diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
@@ -157,14 +157,14 @@ def update_content_length(request: Request, request_body: str):
     request._headers = headers
 
 
-def is_model_healthy(url: str, model: str, model_type: str) -> bool:
+def is_model_healthy(url: str, model: str, model_type: str, timeout: int = 30) -> bool:
     model_details = ModelType[model_type]
     try:
         response = requests.post(
             f"{url}{model_details.value}",
             headers={"Content-Type": "application/json"},
             json={"model": model} | model_details.get_test_payload(model_type),
-            timeout=30,
+            timeout=timeout,
         )
     except Exception as e:
         logger.error(e)