Skip to content

Commit 27a7045

Browse files
committed
Merge remote-tracking branch 'origin/main' into multinode
2 parents fb1c40b + 29ec900 commit 27a7045

File tree

1 file changed

+7 -1 lines changed

1 file changed

+7
-1
lines changed

lightllm/utils/health_check.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
class HealthObj:
2121
_is_health: bool = True
2222
_is_health_checking: bool = False
23+
_failure_count: int = 0
24+
_failure_threshold: int = int(os.getenv("HEALTH_FAILURE_THRESHOLD", 3))
2325
timeout: int = int(os.getenv("HEALTH_TIMEOUT", 100))
2426

2527
def begin_check(self):
@@ -29,10 +31,13 @@ def end_check(self):
2931
self._is_health_checking = False
3032

3133
def set_unhealth(self):
32-
self._is_health = False
34+
self._failure_count += 1
35+
if self._failure_count > self._failure_threshold:
36+
self._is_health = False
3337

3438
def set_health(self):
3539
self._is_health = True
40+
self._failure_count = 0
3641

3742
def is_health(self):
3843
return self._is_health
@@ -72,6 +77,7 @@ async def check_timeout(results_generator):
7277
health_obj.set_health()
7378
except asyncio.TimeoutError:
7479
health_obj.set_unhealth()
80+
logger.warning("Health check timeout!")
7581
return health_obj.is_health()
7682
except Exception as e:
7783
logger.exception(str(e))

0 commit comments

Comments (0)