1- import ray
21import time
32
4- from rock .config import RayConfig
5- from rock .logger import init_logger
6- from rock .utils .rwlock import AsyncRWLock
3+ import ray
74from apscheduler .schedulers .asyncio import AsyncIOScheduler
85from apscheduler .triggers .interval import IntervalTrigger
96
7+ from rock .config import RayConfig
8+ from rock .logger import init_logger
9+ from rock .utils .rwlock import AsyncRWLock , WriteLockTimeout
1010
1111logger = init_logger (__name__ )
1212
@@ -34,7 +34,6 @@ def increment_ray_request_count(self):
3434
def get_ray_rwlock(self):
    """Return the lock object stored on this instance as ``_ray_rwlock``.

    This is the same object whose ``write_lock()`` is held by
    ``_reconnect_ray`` while the Ray connection is being re-established.
    """
    rwlock = self._ray_rwlock
    return rwlock
37-
3837
3938 def _setup_ray_reconnect_scheduler (self ):
4039 self ._ray_reconnection_scheduler = AsyncIOScheduler (
@@ -57,19 +56,22 @@ async def _ray_reconnect_with_policy(self):
5756 await self ._reconnect_ray ()
5857
async def _reconnect_ray(self):
    """Shut down and re-initialise the Ray connection under the write lock.

    The write side of ``self._ray_rwlock`` is requested with a timeout of
    ``self._config.ray_reconnect_wait_timeout_seconds``. Once it is held,
    Ray is shut down and initialised again from ``self._config`` (address,
    runtime_env, namespace, resources), the request counter is reset to 0
    and ``self._ray_establish_time`` is set to the wall-clock completion
    time.

    If acquiring the lock raises ``WriteLockTimeout``, the reconnect is
    skipped and a warning is logged; nothing else is changed. Any other
    exception (for example from ``ray.init``) is not handled here and
    propagates to the caller.
    """
    wait_timeout = self._config.ray_reconnect_wait_timeout_seconds
    try:
        async with self._ray_rwlock.write_lock(timeout=wait_timeout):
            # Wall-clock readings are kept for the log lines and for
            # _ray_establish_time; the duration is taken from the monotonic
            # clock so a system clock adjustment during the reconnect cannot
            # produce a skewed or negative value.
            start_time = time.time()
            started = time.monotonic()
            logger.info(f"current time {start_time} , Reconnect ray cluster")

            ray.shutdown()
            ray.init(
                address=self._config.address,
                runtime_env=self._config.runtime_env,
                namespace=self._config.namespace,
                resources=self._config.resources,
            )

            # A fresh connection starts with a fresh request count.
            self._ray_request_count = 0
            end_time = time.time()
            duration = time.monotonic() - started
            self._ray_establish_time = end_time
            logger.info(
                f"current time {end_time} , Reconnect ray cluster successfully, duration {duration} s"
            )
    except WriteLockTimeout as e:
        # The write lock could not be obtained within the configured wait;
        # skip this attempt rather than failing the caller.
        logger.warning("Reconnect ray cluster timeout, skip reconnectting", exc_info=e)
0 commit comments