@@ -180,6 +180,20 @@ def heart_beat_exit_unexpectedly() -> bool:
180180 pass
181181
182182
183+ def _safe_check (checker , desc : str ) -> bool :
184+ """
185+ 安全执行checker,异常时打印日志并返回False
186+ """
187+ try :
188+ return checker ()
189+ except :
190+ except_type , except_value , except_traceback = sys .exc_info ()
191+ fit_logger .warning (f"check { desc } error, error type: { except_type } , "
192+ f"value: { except_value } , trace back:\n "
193+ f"{ '' .join (traceback .format_tb (except_traceback ))} " )
194+ return False
195+
196+
183197@shutdown_on_error
184198@timer
185199def main ():
@@ -197,22 +211,8 @@ def main():
197211 fit_logger .info ("terminate main enabled." )
198212 while True :
199213 # 明确区分退出原因并打印日志
200- hb_exit = False
201- should_terminate = False
202- try :
203- hb_exit = heart_beat_exit_unexpectedly ()
204- except :
205- except_type , except_value , except_traceback = sys .exc_info ()
206- fit_logger .warning (f"check heart_beat_exit_unexpectedly error, error type: { except_type } , "
207- f"value: { except_value } , trace back:\n "
208- f"{ '' .join (traceback .format_tb (except_traceback ))} " )
209- try :
210- should_terminate = get_should_terminate_main ()
211- except :
212- except_type , except_value , except_traceback = sys .exc_info ()
213- fit_logger .warning (f"check get_should_terminate_main error, error type: { except_type } , "
214- f"value: { except_value } , trace back:\n "
215- f"{ '' .join (traceback .format_tb (except_traceback ))} " )
214+ hb_exit = _safe_check (heart_beat_exit_unexpectedly , "heart_beat_exit_unexpectedly" )
215+ should_terminate = _safe_check (get_should_terminate_main , "get_should_terminate_main" )
216216 if hb_exit :
217217 fit_logger .warning ("main process will exit due to heartbeat background job exited unexpectedly." )
218218 break
@@ -235,37 +235,27 @@ def main():
235235 fit_logger .info (f"Starting process manager with restart policy: { restart_policy .get_status ()} " )
236236
237237 while True :
238+ exit_code = None
238239 try :
239240 main_process = Process (target = main , name = 'MainProcess' )
240241 main_process .start ()
241242 fit_logger .info (f"Main process started with PID: { main_process .pid } " )
242243 main_process .join ()
243-
244- # 检查进程退出码
245244 exit_code = main_process .exitcode
246- fit_logger .info (f"Main process exited with code: { exit_code } " )
247-
248- # 使用重启策略判断是否应该重启
249- if not restart_policy .should_restart (exit_code ):
250- fit_logger .info ("Restart policy indicates no restart needed, stopping" )
251- break
245+ except Exception as e :
246+ fit_logger .error (f"Error during process management: { e } " )
247+ exit_code = - 1
252248
253- # 获取重启延迟
254- restart_delay = restart_policy .get_restart_delay ()
255- status = restart_policy .get_status ()
249+ fit_logger .info (f"Main process exited with code: { exit_code } " )
250+ # 使用重启策略判断是否应该重启
251+ if not restart_policy .should_restart (exit_code ):
252+ fit_logger .info ("Restart policy indicates no restart needed, stopping" )
253+ break
256254
257- fit_logger . warning ( f"Main process exited unexpectedly, restarting in { restart_delay :.2f } seconds... "
258- f"(attempt { status [ 'current_attempt' ] } / { status [ 'max_attempts' ] } )" )
259- time . sleep ( restart_delay )
255+ # 获取重启延迟
256+ restart_delay = restart_policy . get_restart_delay ( )
257+ status = restart_policy . get_status ( )
260258
261- except Exception as e :
262- fit_logger .error (f"Error during process management: { e } " )
263- if not restart_policy .should_restart (- 1 ): # 使用-1表示异常退出
264- fit_logger .error ("Restart policy indicates no restart needed due to errors, stopping" )
265- break
266-
267- restart_delay = restart_policy .get_restart_delay ()
268- status = restart_policy .get_status ()
269- fit_logger .warning (f"Error occurred, restarting in { restart_delay :.2f} seconds... "
270- f"(attempt { status ['current_attempt' ]} /{ status ['max_attempts' ]} )" )
271- time .sleep (restart_delay )
259+ fit_logger .warning (f"Main process exited unexpectedly, restarting in { restart_delay :.2f} seconds... "
260+ f"(attempt { status ['current_attempt' ]} /{ status ['max_attempts' ]} )" )
261+ time .sleep (restart_delay )
0 commit comments