@@ -44,6 +44,7 @@ def __init__(self, role=None, interval=None, band_to_slots=None, use_gpu=True):
4444 self ._interval = interval or DEFAULT_INFO_UPLOAD_INTERVAL
4545 self ._upload_task = None
4646 self ._upload_enabled = False
47+ self ._uploaded_future = asyncio .Future ()
4748 self ._node_ready_event = asyncio .Event ()
4849
4950 self ._use_gpu = use_gpu
@@ -54,7 +55,8 @@ def __init__(self, role=None, interval=None, band_to_slots=None, use_gpu=True):
5455 self ._disk_infos = []
5556
async def __post_create__(self):
    """Start the periodic upload loop and wait for its first outcome.

    The loop coroutine is wrapped in a task stored on ``self._upload_task``.
    This hook then awaits ``self._uploaded_future``; if an exception was
    stored on that future, awaiting it re-raises the exception here.
    """
    background_loop = self._periodical_upload_node_info()
    self._upload_task = asyncio.create_task(background_loop)
    await self._uploaded_future
5860
5961 async def __pre_destroy__ (self ):
6062 self ._upload_task .cancel ()
@@ -77,10 +79,27 @@ async def _get_node_info_ref(self):
7779 NodeInfoCollectorActor .default_uid (), address = supervisor_addr
7880 )
7981
async def _periodical_upload_node_info(self):
    """Upload node info in an endless loop, sleeping between rounds.

    Each round awaits ``self.upload_node_info()`` and then sleeps for
    ``self._interval`` seconds.

    The outcome of the first finished round is published through
    ``self._uploaded_future``: its result is set to ``None`` on success,
    or the raised exception is stored on it on failure.  Once the future
    is done, later rounds leave it untouched.

    A failed round is logged and does not stop the loop.  Cancelling the
    task ends the loop, and the coroutine then returns normally instead
    of propagating ``asyncio.CancelledError``.
    """
    try:
        while True:
            try:
                await self.upload_node_info()
            except Exception as ex:  # pragma: no cover
                # asyncio.CancelledError derives from BaseException on
                # Python 3.8+, so cancellation is not caught here and
                # falls through to the outer handler.
                logger.error("Failed to upload node info: %s", ex)
                if not self._uploaded_future.done():
                    self._uploaded_future.set_exception(ex)
            else:
                if not self._uploaded_future.done():
                    self._uploaded_future.set_result(None)
            await asyncio.sleep(self._interval)
    except asyncio.CancelledError:  # pragma: no cover
        # single exit point for cancellation, whether it arrives during
        # the upload or during the sleep
        return
98+
async def mark_node_ready(self):
    """Enable uploading, push a READY status, then signal readiness.

    ``self._upload_enabled`` is switched on first.  The READY status is
    then uploaded through ``self.upload_node_info``, and only after that
    call returns is ``self._node_ready_event`` set.  If the upload
    raises, the event stays unset.
    """
    self._upload_enabled = True
    # upload info in time to reduce latency
    ready_status = NodeStatus.READY
    await self.upload_node_info(status=ready_status)
    self._node_ready_event.set()
85104
86105 def is_node_ready (self ):
@@ -89,7 +108,7 @@ def is_node_ready(self):
async def wait_node_ready(self):
    """Return the awaitable created by ``self._node_ready_event.wait()``.

    The awaitable is handed back without being awaited here, so this
    coroutine itself finishes without waiting for the event.
    """
    # NOTE(review): returning the un-awaited coroutine looks deliberate
    # (the caller or framework is presumably expected to await it) - confirm.
    ready_waiter = self._node_ready_event.wait()
    return ready_waiter
91110
92- async def upload_node_info (self , call_next : bool = True , status : NodeStatus = None ):
111+ async def upload_node_info (self , status : NodeStatus = None ):
93112 try :
94113 if not self ._info .env :
95114 self ._info .env = await asyncio .to_thread (gather_node_env )
@@ -133,11 +152,6 @@ async def upload_node_info(self, call_next: bool = True, status: NodeStatus = No
133152 except : # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover
134153 logger .exception (f"Failed to upload node info" )
135154 raise
136- finally :
137- if call_next :
138- self ._upload_task = self .ref ().upload_node_info .tell_delay (
139- delay = self ._interval
140- )
141155
142156 def get_bands (self ) -> Dict [BandType , int ]:
143157 band_slots = dict ()
0 commit comments