Skip to content

Commit 4d2a896

Browse files
jh-nvdagil-nvidia
andauthored
fix: fix vllm graceful shutdown (#5818) (#5835)
Co-authored-by: dagil-nvidia <[email protected]>
1 parent edf6258 commit 4d2a896

File tree

4 files changed

+230
-115
lines changed

4 files changed

+230
-115
lines changed

components/src/dynamo/vllm/engine_monitor.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@ class VllmEngineMonitor:
2525
Monitors the health of the vLLM engine and initiates a shutdown if the engine is dead.
2626
"""
2727

28-
def __init__(self, runtime: DistributedRuntime, engine_client: AsyncLLM):
28+
def __init__(
29+
self,
30+
runtime: DistributedRuntime,
31+
engine_client: AsyncLLM,
32+
shutdown_event: asyncio.Event = None,
33+
):
2934
if not isinstance(runtime, DistributedRuntime):
3035
raise ValueError(
3136
f"{self.__class__.__name__} requires an instance of DistributedRuntime."
@@ -37,6 +42,7 @@ def __init__(self, runtime: DistributedRuntime, engine_client: AsyncLLM):
3742

3843
self.runtime = runtime
3944
self.engine_client = engine_client
45+
self.shutdown_event = shutdown_event
4046
self._monitor_task = asyncio.create_task(self._check_engine_health())
4147

4248
logger.info(
@@ -66,10 +72,41 @@ def timeout_handler(signum, frame):
6672
signal.alarm(0)
6773

6874
async def _check_engine_health(self):
75+
"""
76+
Continuously check engine health until:
77+
1. Engine dies (EngineDeadError) - initiate shutdown
78+
2. Shutdown event is triggered - stop monitoring gracefully
79+
3. Task is cancelled - cleanup
80+
"""
6981
while True:
7082
try:
83+
# Check if shutdown event was triggered - stop monitoring
84+
if self.shutdown_event and self.shutdown_event.is_set():
85+
logger.info(
86+
f"{self.__class__.__name__}: Shutdown event detected, stopping engine health monitoring."
87+
)
88+
break
89+
7190
await self.engine_client.check_health()
72-
await asyncio.sleep(HEALTH_CHECK_INTERVAL)
91+
92+
# Sleep with shutdown event awareness for faster response
93+
if self.shutdown_event:
94+
try:
95+
await asyncio.wait_for(
96+
self.shutdown_event.wait(), timeout=HEALTH_CHECK_INTERVAL
97+
)
98+
# Shutdown event was set during sleep
99+
logger.info(
100+
f"{self.__class__.__name__}: Shutdown event detected, stopping engine health monitoring."
101+
)
102+
break
103+
except asyncio.TimeoutError:
104+
# Normal timeout, continue monitoring
105+
pass
106+
else:
107+
# No shutdown event, just sleep normally
108+
await asyncio.sleep(HEALTH_CHECK_INTERVAL)
109+
73110
except EngineDeadError as e:
74111
logger.error(f"Traceback: {traceback.format_exc()}")
75112
logger.error(f"vLLM AsyncLLM health check failed: {e}")
@@ -78,4 +115,5 @@ async def _check_engine_health(self):
78115
self.runtime.shutdown()
79116
os._exit(1)
80117
except asyncio.CancelledError:
81-
pass
118+
logger.debug(f"{self.__class__.__name__}: Health check task cancelled.")
119+
break

0 commit comments

Comments
 (0)