@@ -350,7 +350,8 @@ def __init__(
350
350
label_selector = None ,
351
351
prefill_model_labels : List [str ] | None = None ,
352
352
decode_model_labels : List [str ] | None = None ,
353
- watcher_timeout_seconds : int = 30 ,
353
+ watcher_timeout_seconds : int = 0 ,
354
+ health_check_timeout_seconds : int = 10 ,
354
355
):
355
356
"""
356
357
Initialize the Kubernetes service discovery module. This module
@@ -364,7 +365,7 @@ def __init__(
364
365
namespace: the namespace of the engine pods
365
366
port: the port of the engines
366
367
label_selector: the label selector of the engines
367
- watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 30 )
368
+ watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 0 )
368
369
"""
369
370
self .app = app
370
371
self .namespace = namespace
@@ -373,6 +374,7 @@ def __init__(
373
374
self .available_engines_lock = threading .Lock ()
374
375
self .label_selector = label_selector
375
376
self .watcher_timeout_seconds = watcher_timeout_seconds
377
+ self .health_check_timeout_seconds = health_check_timeout_seconds
376
378
377
379
# Init kubernetes watcher
378
380
try :
@@ -426,7 +428,9 @@ def _get_engine_sleep_status(self, pod_ip) -> Optional[bool]:
426
428
if VLLM_API_KEY := os .getenv ("VLLM_API_KEY" ):
427
429
logger .info ("Using vllm server authentication" )
428
430
headers = {"Authorization" : f"Bearer { VLLM_API_KEY } " }
429
- response = requests .get (url , headers = headers )
431
+ response = requests .get (
432
+ url , headers = headers , timeout = self .health_check_timeout_seconds
433
+ )
430
434
response .raise_for_status ()
431
435
sleep = response .json ()["is_sleeping" ]
432
436
return sleep
@@ -508,7 +512,9 @@ def _get_model_names(self, pod_ip) -> List[str]:
508
512
if VLLM_API_KEY := os .getenv ("VLLM_API_KEY" ):
509
513
logger .info ("Using vllm server authentication" )
510
514
headers = {"Authorization" : f"Bearer { VLLM_API_KEY } " }
511
- response = requests .get (url , headers = headers )
515
+ response = requests .get (
516
+ url , headers = headers , timeout = self .health_check_timeout_seconds
517
+ )
512
518
response .raise_for_status ()
513
519
models = response .json ()["data" ]
514
520
@@ -540,7 +546,9 @@ def _get_model_info(self, pod_ip) -> Dict[str, ModelInfo]:
540
546
if VLLM_API_KEY := os .getenv ("VLLM_API_KEY" ):
541
547
logger .info ("Using vllm server authentication" )
542
548
headers = {"Authorization" : f"Bearer { VLLM_API_KEY } " }
543
- response = requests .get (url , headers = headers )
549
+ response = requests .get (
550
+ url , headers = headers , timeout = self .health_check_timeout_seconds
551
+ )
544
552
response .raise_for_status ()
545
553
models = response .json ()["data" ]
546
554
# Create a dictionary of model information
@@ -582,6 +590,11 @@ def _watch_engines(self):
582
590
pod_name = pod .metadata .name
583
591
pod_ip = pod .status .pod_ip
584
592
593
+ if event_type == "DELETED" :
594
+ if pod_name in self .available_engines :
595
+ self ._delete_engine (pod_name )
596
+ continue
597
+
585
598
# Check if pod is terminating
586
599
is_pod_terminating = self ._is_pod_terminating (pod )
587
600
is_container_ready = self ._check_pod_ready (
@@ -755,7 +768,8 @@ def __init__(
755
768
label_selector = None ,
756
769
prefill_model_labels : List [str ] | None = None ,
757
770
decode_model_labels : List [str ] | None = None ,
758
- watcher_timeout_seconds : int = 30 ,
771
+ watcher_timeout_seconds : int = 0 ,
772
+ health_check_timeout_seconds : int = 10 ,
759
773
):
760
774
"""
761
775
Initialize the Kubernetes service discovery module. This module
@@ -784,7 +798,8 @@ def __init__(
784
798
namespace: the namespace of the engine services
785
799
port: the port of the engines
786
800
label_selector: the label selector of the engines
787
- watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 30)
801
+ watcher_timeout_seconds: timeout in seconds for Kubernetes watcher streams (default: 0)
802
+ health_check_timeout_seconds: timeout in seconds for health check requests (default: 10)
788
803
"""
789
804
self .app = app
790
805
self .namespace = namespace
@@ -793,6 +808,7 @@ def __init__(
793
808
self .available_engines_lock = threading .Lock ()
794
809
self .label_selector = label_selector
795
810
self .watcher_timeout_seconds = watcher_timeout_seconds
811
+ self .health_check_timeout_seconds = health_check_timeout_seconds
796
812
797
813
# Init kubernetes watcher
798
814
try :
@@ -837,7 +853,9 @@ def _get_engine_sleep_status(self, service_name) -> Optional[bool]:
837
853
if VLLM_API_KEY := os .getenv ("VLLM_API_KEY" ):
838
854
logger .info ("Using vllm server authentication" )
839
855
headers = {"Authorization" : f"Bearer { VLLM_API_KEY } " }
840
- response = requests .get (url , headers = headers )
856
+ response = requests .get (
857
+ url , headers = headers , timeout = self .health_check_timeout_seconds
858
+ )
841
859
response .raise_for_status ()
842
860
sleep = response .json ()["is_sleeping" ]
843
861
return sleep
@@ -931,7 +949,9 @@ def _get_model_names(self, service_name) -> List[str]:
931
949
if VLLM_API_KEY := os .getenv ("VLLM_API_KEY" ):
932
950
logger .info ("Using vllm server authentication" )
933
951
headers = {"Authorization" : f"Bearer { VLLM_API_KEY } " }
934
- response = requests .get (url , headers = headers )
952
+ response = requests .get (
953
+ url , headers = headers , timeout = self .health_check_timeout_seconds
954
+ )
935
955
response .raise_for_status ()
936
956
models = response .json ()["data" ]
937
957
@@ -963,7 +983,9 @@ def _get_model_info(self, service_name) -> Dict[str, ModelInfo]:
963
983
if VLLM_API_KEY := os .getenv ("VLLM_API_KEY" ):
964
984
logger .info ("Using vllm server authentication" )
965
985
headers = {"Authorization" : f"Bearer { VLLM_API_KEY } " }
966
- response = requests .get (url , headers = headers )
986
+ response = requests .get (
987
+ url , headers = headers , timeout = self .health_check_timeout_seconds
988
+ )
967
989
response .raise_for_status ()
968
990
models = response .json ()["data" ]
969
991
# Create a dictionary of model information
@@ -1002,6 +1024,10 @@ def _watch_engines(self):
1002
1024
):
1003
1025
service = event ["object" ]
1004
1026
event_type = event ["type" ]
1027
+ if event_type == "DELETED" :
1028
+ if service .metadata .name in self .available_engines :
1029
+ self ._delete_engine (service .metadata .name )
1030
+ continue
1005
1031
service_name = service .metadata .name
1006
1032
is_service_ready = self ._check_service_ready (
1007
1033
service_name , self .namespace
0 commit comments