@@ -25,7 +25,12 @@ class VllmEngineMonitor:
2525 Monitors the health of the vLLM engine and initiates a shutdown if the engine is dead.
2626 """
2727
28- def __init__ (self , runtime : DistributedRuntime , engine_client : AsyncLLM ):
28+ def __init__ (
29+ self ,
30+ runtime : DistributedRuntime ,
31+ engine_client : AsyncLLM ,
32+ shutdown_event : asyncio .Event = None ,
33+ ):
2934 if not isinstance (runtime , DistributedRuntime ):
3035 raise ValueError (
3136 f"{ self .__class__ .__name__ } requires an instance of DistributedRuntime."
@@ -37,6 +42,7 @@ def __init__(self, runtime: DistributedRuntime, engine_client: AsyncLLM):
3742
3843 self .runtime = runtime
3944 self .engine_client = engine_client
45+ self .shutdown_event = shutdown_event
4046 self ._monitor_task = asyncio .create_task (self ._check_engine_health ())
4147
4248 logger .info (
@@ -66,10 +72,41 @@ def timeout_handler(signum, frame):
6672 signal .alarm (0 )
6773
6874 async def _check_engine_health (self ):
75+ """
76+ Continuously check engine health until:
77+ 1. Engine dies (EngineDeadError) - initiate shutdown
78+ 2. Shutdown event is triggered - stop monitoring gracefully
79+ 3. Task is cancelled - cleanup
80+ """
6981 while True :
7082 try :
83+ # Check if shutdown event was triggered - stop monitoring
84+ if self .shutdown_event and self .shutdown_event .is_set ():
85+ logger .info (
86+ f"{ self .__class__ .__name__ } : Shutdown event detected, stopping engine health monitoring."
87+ )
88+ break
89+
7190 await self .engine_client .check_health ()
72- await asyncio .sleep (HEALTH_CHECK_INTERVAL )
91+
92+ # Sleep with shutdown event awareness for faster response
93+ if self .shutdown_event :
94+ try :
95+ await asyncio .wait_for (
96+ self .shutdown_event .wait (), timeout = HEALTH_CHECK_INTERVAL
97+ )
98+ # Shutdown event was set during sleep
99+ logger .info (
100+ f"{ self .__class__ .__name__ } : Shutdown event detected, stopping engine health monitoring."
101+ )
102+ break
103+ except asyncio .TimeoutError :
104+ # Normal timeout, continue monitoring
105+ pass
106+ else :
107+ # No shutdown event, just sleep normally
108+ await asyncio .sleep (HEALTH_CHECK_INTERVAL )
109+
73110 except EngineDeadError as e :
74111 logger .error (f"Traceback: { traceback .format_exc ()} " )
75112 logger .error (f"vLLM AsyncLLM health check failed: { e } " )
@@ -78,4 +115,5 @@ async def _check_engine_health(self):
78115 self .runtime .shutdown ()
79116 os ._exit (1 )
80117 except asyncio .CancelledError :
81- pass
118+ logger .debug (f"{ self .__class__ .__name__ } : Health check task cancelled." )
119+ break
0 commit comments