feat: add endpoint health checks to static router (#428)

max-wittig · web-flow · commit 9b127b26e99f · 2025-05-28T02:20:56.000-05:00
Closes #420 Signed-off-by: Max Wittig <max.wittig@siemens.com>
diff --git a/src/tests/test_parser.py b/src/tests/test_parser.py
@@ -77,3 +77,37 @@ def test_load_initial_config_from_config_json_if_required_when_config_file_is_pr
             test_parser, args
         )
         assert args.routing_logic == "roundrobin"
+
+
+def test_validate_args_when_service_discovery_is_set_to_static_and_static_backend_health_checks_is_set_and_static_model_types_is_not_set_raises_value_error() -> (
+    None
+):
+    with pytest.raises(ValueError):
+        parser.validate_args(
+            MagicMock(  # stands in for the parsed argument namespace
+                routing_logic="roundrobin",
+                service_discovery="static",
+                static_backend_health_checks=True,
+                static_model_types=None,  # the missing option that must trigger the ValueError
+            )
+        )
+
+
+def test_validate_static_model_types_when_model_types_is_not_defines_raises_value_error() -> (
+    None
+):
+    with pytest.raises(ValueError):
+        parser.validate_static_model_types(None)  # None = no model types were supplied
+
+
+def test_validate_static_model_types_when_model_types_contains_unsupported_model_type_raises_value_error() -> (
+    None
+):
+    with pytest.raises(ValueError):
+        parser.validate_static_model_types("chat,unsupported")  # second entry is the one expected to be rejected
+
+
+def test_validate_static_model_types_when_model_types_contains_only_supported_model_types_does_not_raise_error() -> (
+    None
+):
+    parser.validate_static_model_types("chat,completion,rerank,score")  # passes when nothing is raised; no explicit assert
diff --git a/src/tests/test_static_service_discovery.py b/src/tests/test_static_service_discovery.py
@@ -0,0 +1,111 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from vllm_router.service_discovery import StaticServiceDiscovery
+
+
+def test_init_when_static_backend_health_checks_calls_start_health_checks(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    start_health_check_mock = MagicMock()
+    monkeypatch.setattr(  # swap the method on the class so the constructor starts no real thread
+        "vllm_router.service_discovery.StaticServiceDiscovery.start_health_check_task",
+        start_health_check_mock,
+    )
+    discovery_instance = StaticServiceDiscovery(
+        [], [], None, None, None, static_backend_health_checks=True
+    )
+    discovery_instance.start_health_check_task.assert_called_once()  # attribute lookup resolves to the class-level mock
+
+
+def test_init_when_endpoint_health_check_disabled_does_not_call_start_health_checks(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    start_health_check_mock = MagicMock()
+    monkeypatch.setattr(  # same patch as the enabled case, so any call would be recorded
+        "vllm_router.service_discovery.StaticServiceDiscovery.start_health_check_task",
+        start_health_check_mock,
+    )
+    discovery_instance = StaticServiceDiscovery(
+        [], [], None, None, None, static_backend_health_checks=False
+    )
+    discovery_instance.start_health_check_task.assert_not_called()  # flag off: the constructor must not start the task
+
+
+def test_get_unhealthy_endpoint_hashes_when_only_healthy_models_exist_does_not_return_unhealthy_endpoint_hashes(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr("vllm_router.utils.is_model_healthy", lambda *_: True)  # every probe reports healthy
+    discovery_instance = StaticServiceDiscovery(
+        ["http://localhost.com"],
+        ["llama3"],
+        None,
+        None,
+        ["chat"],
+        static_backend_health_checks=False,  # keep off: True would start a real daemon thread that outlives the monkeypatch
+    )
+    assert discovery_instance.get_unhealthy_endpoint_hashes() == []  # the method is called directly, no background task needed
+
+
+def test_get_unhealthy_endpoint_hashes_when_unhealthy_model_exist_returns_unhealthy_endpoint_hash(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr("vllm_router.utils.is_model_healthy", lambda *_: False)  # every probe reports unhealthy
+    discovery_instance = StaticServiceDiscovery(
+        ["http://localhost.com"],
+        ["llama3"],
+        None,
+        None,
+        ["chat"],
+        static_backend_health_checks=False,
+    )
+    assert discovery_instance.get_unhealthy_endpoint_hashes() == [
+        "ee7d421a744e07595b70f98c11be93e7"  # expected md5 hex digest of the url and model strings joined together
+    ]
+
+
+def test_get_unhealthy_endpoint_hashes_when_healthy_and_unhealthy_models_exist_returns_only_unhealthy_endpoint_hash(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    unhealthy_model = "bge-m3"
+
+    def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
+        return model != unhealthy_model  # only the designated model fails its probe
+
+    monkeypatch.setattr("vllm_router.utils.is_model_healthy", mock_is_model_healthy)
+    discovery_instance = StaticServiceDiscovery(
+        ["http://localhost.com", "http://10.123.112.412"],  # second address is not a valid IPv4; it only serves as hash input here
+        ["llama3", unhealthy_model],
+        None,
+        None,
+        ["chat", "embeddings"],
+        static_backend_health_checks=False,
+    )
+    assert discovery_instance.get_unhealthy_endpoint_hashes() == [
+        "01e1b07eca36d39acacd55a33272a225"  # expected digest for the second url/model pair; editing either string changes it
+    ]
+
+
+def test_get_endpoint_info_when_model_endpoint_hash_is_in_unhealthy_endpoint_does_not_return_endpoint(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    unhealthy_model = "mistral"
+
+    def mock_get_model_endpoint_hash(url: str, model: str) -> str:  # no self: it is patched onto the instance below
+        return "some-hash" if model == unhealthy_model else "other-hash"
+
+    discovery_instance = StaticServiceDiscovery(
+        ["http://localhost.com", "http://10.123.112.412"],
+        ["llama3", unhealthy_model],
+        None,
+        None,
+        ["chat", "chat"],
+        static_backend_health_checks=False,
+    )
+    discovery_instance.unhealthy_endpoint_hashes = ["some-hash"]  # mark the mistral endpoint unhealthy by hand
+    monkeypatch.setattr(
+        discovery_instance, "get_model_endpoint_hash", mock_get_model_endpoint_hash
+    )
+    assert len(discovery_instance.get_endpoint_info()) == 1  # the unhealthy endpoint is filtered out
+    assert discovery_instance.get_endpoint_info()[0].model_name == "llama3"
diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py
@@ -2,6 +2,7 @@
 from unittest.mock import MagicMock
 
 import pytest
+import requests
 from starlette.datastructures import MutableHeaders
 
 from vllm_router import utils
@@ -82,3 +83,27 @@ def test_get_all_fields_returns_list_of_strings() -> None:
     fields = utils.ModelType.get_all_fields()
     assert isinstance(fields, list)
     assert isinstance(fields[0], str)
+
+
+def test_is_model_healthy_when_requests_responds_with_status_code_200_returns_true(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    request_mock = MagicMock(return_value=MagicMock(status_code=200))  # fake response object with status 200
+    monkeypatch.setattr("requests.post", request_mock)  # presumably what is_model_healthy calls (utils.py is not shown here)
+    assert utils.is_model_healthy("http://localhost", "test", "chat") is True
+
+
+def test_is_model_healthy_when_requests_raises_exception_returns_false(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    request_mock = MagicMock(side_effect=requests.exceptions.ReadTimeout)  # calling the mock raises ReadTimeout
+    monkeypatch.setattr("requests.post", request_mock)
+    assert utils.is_model_healthy("http://localhost", "test", "chat") is False  # the error must be reported as unhealthy, not propagate
+
+
+def test_is_model_healthy_when_requests_status_with_status_code_not_200_returns_false(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    request_mock = MagicMock(return_value=MagicMock(status_code=500))  # fake response object with status 500
+    monkeypatch.setattr("requests.post", request_mock)
+    assert utils.is_model_healthy("http://localhost", "test", "chat") is False
diff --git a/src/vllm_router/README.md b/src/vllm_router/README.md
@@ -28,6 +28,7 @@ The router can be configured using command-line arguments. Below are the availab
 - `--static-backends`: The URLs of static serving engines, separated by commas (e.g., `http://localhost:8000,http://localhost:8001`).
 - `--static-models`: The models running in the static serving engines, separated by commas (e.g., `model1,model2`).
 - `--static-aliases`: The aliases of the models running in the static serving engines, separated by commas and associated using colons (e.g., `model_alias1:model,mode_alias2:model`).
+- `--static-backend-health-checks`: Enable this flag to make vllm-router check periodically if the models work by sending dummy requests to their endpoints.
 - `--k8s-port`: The port of vLLM processes when using K8s service discovery. Default is `8000`.
 - `--k8s-namespace`: The namespace of vLLM pods when using K8s service discovery. Default is `default`.
 - `--k8s-label-selector`: The label selector to filter vLLM pods when using K8s service discovery.
@@ -82,11 +83,25 @@ vllm-router --port 8000 \
     --static-backends "http://localhost:9001,http://localhost:9002,http://localhost:9003" \
     --static-models "facebook/opt-125m,meta-llama/Llama-3.1-8B-Instruct,facebook/opt-125m" \
     --static-aliases "gpt4:meta-llama/Llama-3.1-8B-Instruct" \
+    --static-model-types "chat,chat,chat" \
+    --static-backend-health-checks \
     --engine-stats-interval 10 \
     --log-stats \
     --routing-logic roundrobin
 ```
 
+## Backend health checks
+
+By enabling the `--static-backend-health-checks` flag, **vllm-router** will send a simple request to
+your LLM nodes every minute to verify that they still work.
+If a node is down, it will output a warning and exclude the node from being routed to.
+
+If you enable this flag, it is also required that you specify `--static-model-types`, as we have to use
+different endpoints for each model type.
+
+> Enabling this flag will put some load on your backend every minute as real requests are sent to the nodes
+> to test their functionality.
+
 ## Dynamic Router Config
 
 The router can be configured dynamically using a json file when passing the `--dynamic-config-json` option.
diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py
@@ -147,6 +147,7 @@ def initialize_all(app: FastAPI, args):
                 if args.static_model_labels
                 else None
             ),
+            static_backend_health_checks=args.static_backend_health_checks,
         )
     elif args.service_discovery == "k8s":
         initialize_service_discovery(
diff --git a/src/vllm_router/parsers/parser.py b/src/vllm_router/parsers/parser.py
@@ -16,6 +16,7 @@
 import logging
 import sys
 
+from vllm_router import utils
 from vllm_router.version import __version__
 
 try:
@@ -65,6 +66,19 @@ def load_initial_config_from_config_json_if_required(
     return args
 
 
+def validate_static_model_types(model_types: str | None) -> None:  # raises ValueError when the value is missing or names an unknown type
+    if model_types is None:
+        raise ValueError(
+            "Static model types must be provided when using the backend healthcheck."
+        )
+    all_models = utils.ModelType.get_all_fields()  # the accepted model-type names
+    for model_type in utils.parse_comma_separated_args(model_types):  # NOTE(review): only membership is checked, not that the count matches the number of backends
+        if model_type not in all_models:
+            raise ValueError(
+                f"The model type '{model_type}' is not supported. Supported model types are '{','.join(all_models)}'"
+            )
+
+
 # --- Argument Parsing and Initialization ---
 def validate_args(args):
     verify_required_args_provided(args)
@@ -77,6 +91,8 @@ def validate_args(args):
             raise ValueError(
                 "Static models must be provided when using static service discovery."
             )
+        if args.static_backend_health_checks:
+            validate_static_model_types(args.static_model_types)
     if args.service_discovery == "k8s" and args.k8s_port is None:
         raise ValueError("K8s port must be provided when using K8s service discovery.")
     if args.routing_logic == "session" and args.session_key is None:
@@ -135,6 +151,11 @@ def parse_args():
         default=None,
         help="The model labels of static backends, separated by commas. E.g., model1,model2",
     )
+    parser.add_argument(
+        "--static-backend-health-checks",
+        action="store_true",
+        help="Enable this flag to make vllm-router check periodically if the models work by sending dummy requests to their endpoints.",
+    )
     parser.add_argument(
         "--k8s-port",
         type=int,
diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 import abc
+import asyncio
 import enum
+import hashlib
 import os
 import threading
 import time
@@ -23,6 +25,7 @@
 import requests
 from kubernetes import client, config, watch
 
+from vllm_router import utils
 from vllm_router.log import init_logger
 
 logger = init_logger(__name__)
@@ -86,6 +89,7 @@ def __init__(
         aliases: List[str] | None,
         model_labels: List[str] | None,
         model_types: List[str] | None,
+        static_backend_health_checks: bool,
     ):
         assert len(urls) == len(models), "URLs and models should have the same length"
         self.urls = urls
@@ -94,6 +98,37 @@ def __init__(
         self.model_labels = model_labels
         self.model_types = model_types
         self.added_timestamp = int(time.time())
+        self.unhealthy_endpoint_hashes = []
+        if static_backend_health_checks:
+            self.start_health_check_task()
+
+    def get_unhealthy_endpoint_hashes(self) -> list[str]:  # probes each backend once and returns the hashes of those that failed
+        unhealthy_endpoints = []
+        for url, model, model_type in zip(self.urls, self.models, self.model_types):  # NOTE(review): model_types is annotated as possibly None in __init__; zip() would raise TypeError then
+            if utils.is_model_healthy(url, model, model_type):
+                logger.debug(f"{model} at {url} is healthy")
+            else:
+                logger.warning(f"{model} at {url} not healthy!")
+                unhealthy_endpoints.append(self.get_model_endpoint_hash(url, model))  # same hash get_endpoint_info filters on
+        return unhealthy_endpoints
+
+    async def check_model_health(self):  # endless loop: refresh self.unhealthy_endpoint_hashes, then wait a minute
+        while True:
+            try:
+                self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes()
+            except Exception as e:  # a failed round is logged and retried on the next tick
+                logger.error(e)
+            await asyncio.sleep(60)  # outside the try so a failing round also waits instead of spinning; yields to the loop
+
+    def start_health_check_task(self) -> None:  # runs check_model_health on a private event loop inside a daemon thread
+        self.loop = asyncio.new_event_loop()
+        self.thread = threading.Thread(target=self.loop.run_forever, daemon=True)
+        self.thread.start()
+        asyncio.run_coroutine_threadsafe(self.check_model_health(), self.loop)  # the returned future is discarded
+        logger.info("Health check thread started")
+
+    def get_model_endpoint_hash(self, url: str, model: str) -> str:  # identifier for one url/model pair (not a security hash)
+        return hashlib.md5(f"{url}{model}".encode(), usedforsecurity=False).hexdigest()  # flag keeps FIPS builds working; digest unchanged
 
     def get_endpoint_info(self) -> List[EndpointInfo]:
         """
@@ -103,18 +138,16 @@ def get_endpoint_info(self) -> List[EndpointInfo]:
         Returns:
             a list of engine URLs
         """
-        if self.model_labels is None:
-            endpoint_infos = [
-                EndpointInfo(url, model, self.added_timestamp, "default")
-                for url, model in zip(self.urls, self.models)
-            ]
-        else:
-            endpoint_infos = [
-                EndpointInfo(url, model, self.added_timestamp, model_label)
-                for url, model, model_label in zip(
-                    self.urls, self.models, self.model_labels
-                )
-            ]
+        if not self.model_labels:
+            self.model_labels = ["default"] * len(self.models)
+        endpoint_infos = [
+            EndpointInfo(url, model, self.added_timestamp, model_label)
+            for url, model, model_label in zip(
+                self.urls, self.models, self.model_labels
+            )
+            if self.get_model_endpoint_hash(url, model)
+            not in self.unhealthy_endpoint_hashes
+        ]
         return endpoint_infos
 
 
diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -147,6 +147,7 @@ def initialize_all(app: FastAPI, args):`
`147`	`147`	`if args.static_model_labels`
`148`	`148`	`else None`
`149`	`149`	`),`
	`150`	`+ static_backend_health_checks=args.static_backend_health_checks,`
`150`	`151`	`)`
`151`	`152`	`elif args.service_discovery == "k8s":`
`152`	`153`	`initialize_service_discovery(`