[Bugfix] Fix routing to deleted endpoint (#668)

zerofishnoodles · web-flow · commit 8367d431e286 · 2025-08-27T16:58:41.000-07:00
* [CI] remove docker image before building router image

  Signed-off-by: Rui Zhang <zrfishnoodles@gmail.com>

* [Bugfix] Fix routing to deleted endpoint

  Signed-off-by: Rui Zhang <zrfishnoodles@gmail.com>

* Revert "[CI] remove docker image before building router image"

  This reverts commit 8319e01.

  Signed-off-by: Rui Zhang <zrfishnoodles@gmail.com>

* make health check timeout configurable

  Signed-off-by: Rui Zhang <zrfishnoodles@gmail.com>

---------

Signed-off-by: Rui Zhang <zrfishnoodles@gmail.com>
diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py
@@ -165,6 +165,7 @@ def initialize_all(app: FastAPI, args):
             prefill_model_labels=args.prefill_model_labels,
             decode_model_labels=args.decode_model_labels,
             watcher_timeout_seconds=args.k8s_watcher_timeout_seconds,
+            health_check_timeout_seconds=args.backend_health_check_timeout_seconds,
         )
 
     else:
diff --git a/src/vllm_router/parsers/parser.py b/src/vllm_router/parsers/parser.py
@@ -97,8 +97,6 @@ def validate_args(args):
             validate_static_model_types(args.static_model_types)
     if args.service_discovery == "k8s" and args.k8s_port is None:
         raise ValueError("K8s port must be provided when using K8s service discovery.")
-    if args.k8s_watcher_timeout_seconds <= 0:
-        raise ValueError("k8s-watcher-timeout-seconds must be greater than 0.")
     if args.routing_logic == "session" and args.session_key is None:
         raise ValueError(
             "Session key must be provided when using session routing logic."
@@ -193,8 +191,14 @@ def parse_args():
     parser.add_argument(
         "--k8s-watcher-timeout-seconds",
         type=int,
-        default=30,
-        help="Timeout in seconds for Kubernetes watcher streams (default: 30).",
+        default=0,
+        help="Timeout in seconds for Kubernetes watcher streams (default: 0).",
+    )
+    parser.add_argument(
+        "--backend-health-check-timeout-seconds",
+        type=int,
+        default=10,
+        help="Timeout in seconds for backend health check requests (default: 10).",
     )
     parser.add_argument(
         "--routing-logic",
diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -350,7 +350,8 @@ def __init__(
         label_selector=None,
         prefill_model_labels: List[str] | None = None,
         decode_model_labels: List[str] | None = None,
-        watcher_timeout_seconds: int = 30,
+        watcher_timeout_seconds: int = 0,
+        health_check_timeout_seconds: int = 10,
     ):
         """
         Initialize the Kubernetes service discovery module. This module
@@ -364,7 +365,7 @@ def __init__(
             namespace: the namespace of the engine pods
             port: the port of the engines
             label_selector: the label selector of the engines
-            watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 30)
+            watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 0)
         """
         self.app = app
         self.namespace = namespace
@@ -373,6 +374,7 @@ def __init__(
         self.available_engines_lock = threading.Lock()
         self.label_selector = label_selector
         self.watcher_timeout_seconds = watcher_timeout_seconds
+        self.health_check_timeout_seconds = health_check_timeout_seconds
 
         # Init kubernetes watcher
         try:
@@ -426,7 +428,9 @@ def _get_engine_sleep_status(self, pod_ip) -> Optional[bool]:
             if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
                 logger.info("Using vllm server authentication")
                 headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
-            response = requests.get(url, headers=headers)
+            response = requests.get(
+                url, headers=headers, timeout=self.health_check_timeout_seconds
+            )
             response.raise_for_status()
             sleep = response.json()["is_sleeping"]
             return sleep
@@ -508,7 +512,9 @@ def _get_model_names(self, pod_ip) -> List[str]:
             if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
                 logger.info("Using vllm server authentication")
                 headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
-            response = requests.get(url, headers=headers)
+            response = requests.get(
+                url, headers=headers, timeout=self.health_check_timeout_seconds
+            )
             response.raise_for_status()
             models = response.json()["data"]
 
@@ -540,7 +546,9 @@ def _get_model_info(self, pod_ip) -> Dict[str, ModelInfo]:
             if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
                 logger.info("Using vllm server authentication")
                 headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
-            response = requests.get(url, headers=headers)
+            response = requests.get(
+                url, headers=headers, timeout=self.health_check_timeout_seconds
+            )
             response.raise_for_status()
             models = response.json()["data"]
             # Create a dictionary of model information
@@ -582,6 +590,11 @@ def _watch_engines(self):
                     pod_name = pod.metadata.name
                     pod_ip = pod.status.pod_ip
 
+                    if event_type == "DELETED":
+                        if pod_name in self.available_engines:
+                            self._delete_engine(pod_name)
+                        continue
+
                     # Check if pod is terminating
                     is_pod_terminating = self._is_pod_terminating(pod)
                     is_container_ready = self._check_pod_ready(
@@ -755,7 +768,8 @@ def __init__(
         label_selector=None,
         prefill_model_labels: List[str] | None = None,
         decode_model_labels: List[str] | None = None,
-        watcher_timeout_seconds: int = 30,
+        watcher_timeout_seconds: int = 0,
+        health_check_timeout_seconds: int = 10,
     ):
         """
         Initialize the Kubernetes service discovery module. This module
@@ -784,7 +798,8 @@ def __init__(
             namespace: the namespace of the engine services
             port: the port of the engines
             label_selector: the label selector of the engines
-            watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 30)
+            watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 0)
+            health_check_timeout_seconds: timeout in seconds for health check requests (default: 10)
         """
         self.app = app
         self.namespace = namespace
@@ -793,6 +808,7 @@ def __init__(
         self.available_engines_lock = threading.Lock()
         self.label_selector = label_selector
         self.watcher_timeout_seconds = watcher_timeout_seconds
+        self.health_check_timeout_seconds = health_check_timeout_seconds
 
         # Init kubernetes watcher
         try:
@@ -837,7 +853,9 @@ def _get_engine_sleep_status(self, service_name) -> Optional[bool]:
             if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
                 logger.info("Using vllm server authentication")
                 headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
-            response = requests.get(url, headers=headers)
+            response = requests.get(
+                url, headers=headers, timeout=self.health_check_timeout_seconds
+            )
             response.raise_for_status()
             sleep = response.json()["is_sleeping"]
             return sleep
@@ -931,7 +949,9 @@ def _get_model_names(self, service_name) -> List[str]:
             if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
                 logger.info("Using vllm server authentication")
                 headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
-            response = requests.get(url, headers=headers)
+            response = requests.get(
+                url, headers=headers, timeout=self.health_check_timeout_seconds
+            )
             response.raise_for_status()
             models = response.json()["data"]
 
@@ -963,7 +983,9 @@ def _get_model_info(self, service_name) -> Dict[str, ModelInfo]:
             if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
                 logger.info("Using vllm server authentication")
                 headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
-            response = requests.get(url, headers=headers)
+            response = requests.get(
+                url, headers=headers, timeout=self.health_check_timeout_seconds
+            )
             response.raise_for_status()
             models = response.json()["data"]
             # Create a dictionary of model information
@@ -1002,6 +1024,10 @@ def _watch_engines(self):
                 ):
                     service = event["object"]
                     event_type = event["type"]
+                    if event_type == "DELETED":
+                        if service.metadata.name in self.available_engines:
+                            self._delete_engine(service.metadata.name)
+                        continue
                     service_name = service.metadata.name
                     is_service_ready = self._check_service_ready(
                         service_name, self.namespace

Original file line number	Diff line number	Diff line change
`@@ -165,6 +165,7 @@ def initialize_all(app: FastAPI, args):`
`165`	`165`	`prefill_model_labels=args.prefill_model_labels,`
`166`	`166`	`decode_model_labels=args.decode_model_labels,`
`167`	`167`	`watcher_timeout_seconds=args.k8s_watcher_timeout_seconds,`
	`168`	`+ health_check_timeout_seconds=args.backend_health_check_timeout_seconds,`
`168`	`169`	`)`
`169`	`170`
`170`	`171`	`else:`