 import tempfile, time
 import traceback
 import io
+from loguru import logger


 @celery_app.task(name="run_pipline_job")
@@ -91,18 +92,42 @@ def run_pipline_job(job_uuid,user_id, user_name, user_token):
         # insert_pipline_job_run_task_log_error(job_uuid, f"not exists yaml config : {job_uuid}")
         # return False
     except Exception as e:
-        if job_obj is not None:
-            job_obj.status = JOB_STATUS.FAILED.value
-            # print(f"--------{job_uuid} Error occurred while executing the task: {e.__str__()}")
+        if job_obj is not None and db_session:
+            try:
+                # Make sure the job status is updated to FAILED
+                job_obj.status = JOB_STATUS.FAILED.value
+                job_obj.date_finish = get_current_time()
+                db_session.flush()   # flush first so the changes are recorded
+                db_session.commit()  # then commit to persist them
+            except Exception as db_error:
+                logger.error(f"Failed to update job status to FAILED in outer exception handler: {db_error}")
+                try:
+                    db_session.rollback()
+                    # Re-query the job object on a clean transaction
+                    job_obj_refreshed = get_pipline_job_by_uid(db_session, job_uuid)
+                    if job_obj_refreshed:
+                        job_obj_refreshed.status = JOB_STATUS.FAILED.value
+                        job_obj_refreshed.date_finish = get_current_time()
+                        db_session.commit()
+                except Exception as e2:
+                    logger.error(f"Failed to update job status even after rollback: {e2}")
         insert_pipline_job_run_task_log_error(job_uuid, f"{job_uuid} Error occurred while executing the task: {e.__str__()}")
         traceback.print_exc()
         return False
     finally:
-        if job_obj:
-            job_obj.date_finish = get_current_time()
-        if db_session and job_obj:
-            db_session.commit()
-            db_session.close()
+        if job_obj and db_session:
+            try:
+                # Make sure the finish time is set (if it was not set earlier)
+                if not job_obj.date_finish:
+                    job_obj.date_finish = get_current_time()
+                # If the status is still PROCESSING, the exception handling may not have run correctly; set it to FAILED
+                if job_obj.status == JOB_STATUS.PROCESSING.value:
+                    job_obj.status = JOB_STATUS.FAILED.value
+                db_session.commit()
+            except Exception as e:
+                logger.error(f"Failed to update job in finally block: {e}")
+            finally:
+                db_session.close()
         # if yaml_temp_dir and os.path.exists(yaml_temp_dir) and os.path.isdir(yaml_temp_dir):
         #     shutil.rmtree(yaml_temp_dir)
     if current_process_id > 0 and current_ip is not None and work_name is not None:
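
The FAILED-update-with-rollback-retry sequence now appears twice: in the outer handler above and again in run_pipline_job_task below. A minimal sketch of how it could be factored into one best-effort helper, assuming a SQLAlchemy-style session plus the repo's existing JOB_STATUS, get_current_time, get_pipline_job_by_uid, and logger names (the helper name mark_job_failed is hypothetical):

def mark_job_failed(db_session, job_obj, job_uuid):
    # Best-effort: mark the job FAILED and stamp the finish time; never raises.
    try:
        job_obj.status = JOB_STATUS.FAILED.value
        job_obj.date_finish = get_current_time()
        db_session.flush()   # surface database errors before the commit
        db_session.commit()
    except Exception as db_error:
        logger.error(f"Failed to update job status to FAILED: {db_error}")
        try:
            # A failed commit leaves the session unusable until rollback()
            db_session.rollback()
            refreshed = get_pipline_job_by_uid(db_session, job_uuid)
            if refreshed:
                refreshed.status = JOB_STATUS.FAILED.value
                refreshed.date_finish = get_current_time()
                db_session.commit()
        except Exception as e2:
            logger.error(f"Failed to update job status even after rollback: {e2}")

Note that flush() immediately before commit() is redundant for persistence, since commit() flushes, but it does surface flush-time errors separately; the rollback branch is the part that matters, because SQLAlchemy will not accept further SQL on a session whose commit failed until rollback() is called.
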
@@ -183,21 +208,46 @@ def run_pipline_job_task(config,job,session,user_id, user_name, user_token):
     job.work_dir = work_dir
     job.status = JOB_STATUS.PROCESSING.value
     session.commit()
-    _, branch_name = executor.run()
-
-    trace_dir = os.path.join(work_dir, 'trace')
-    first_op = list(cfg.process[0])[0]
-    count_filename = f"count-{first_op}.txt"
-    count_filepath = os.path.join(trace_dir, count_filename)
-    data_count = 0
-    if os.path.exists(count_filepath):
-        with open(count_filepath, 'r') as f:
-            data_lines = f.read().strip()
-            data_count = int(data_lines)
-
-    job.data_count = data_count
-    job.status = JOB_STATUS.FINISHED.value
-    job.process_count = data_count
-    job.export_repo_id = repo_id
-    job.export_branch_name = branch_name
-    session.commit()
+    try:
+        _, branch_name = executor.run()
+
+        trace_dir = os.path.join(work_dir, 'trace')
+        first_op = list(cfg.process[0])[0]
+        count_filename = f"count-{first_op}.txt"
+        count_filepath = os.path.join(trace_dir, count_filename)
+        data_count = 0
+        if os.path.exists(count_filepath):
+            with open(count_filepath, 'r') as f:
+                data_lines = f.read().strip()
+                data_count = int(data_lines) if data_lines else 0
+
+        job.data_count = data_count
+        job.status = JOB_STATUS.FINISHED.value
+        job.process_count = data_count
+        job.export_repo_id = repo_id
+        job.export_branch_name = branch_name
+        session.commit()
+    except Exception as e:
+        # When an operator fails, make sure the job status is updated to FAILED
+        try:
+            # Use flush and commit so the status change is saved correctly
+            job.status = JOB_STATUS.FAILED.value
+            job.date_finish = get_current_time()
+            session.flush()   # flush first so the changes are recorded
+            session.commit()  # then commit to persist them
+        except Exception as db_error:
+            # If the update fails, log the error but still raise the original exception
+            logger.error(f"Failed to update job status to FAILED: {db_error}")
+            # Try to roll back and retry the update once
+            try:
+                session.rollback()
+                from data_celery.db.JobsManager import get_pipline_job_by_uid
+                job_from_db = get_pipline_job_by_uid(session, job.uuid)
+                if job_from_db:
+                    job_from_db.status = JOB_STATUS.FAILED.value
+                    job_from_db.date_finish = get_current_time()
+                    session.commit()
+            except Exception as e2:
+                logger.error(f"Failed to update job status even after rollback: {e2}")
+        insert_pipline_job_run_task_log_error(job.uuid, f"{job.uuid} Error occurred during pipeline execution: {str(e)}")
+        raise
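
The new failure path in run_pipline_job_task can be pinned down with a test that stubs the executor so run() raises and then checks the job row. A pytest-style sketch; the fixtures (session, job, cfg) and the Executor patch point are assumptions for illustration, not part of this PR:

import pytest

class FailingExecutor:
    def run(self):
        raise RuntimeError("operator crashed")

def test_failed_operator_marks_job_failed(monkeypatch, session, job, cfg):
    # The patch target is illustrative; use the module where the executor is built
    monkeypatch.setattr("data_celery.tasks.Executor", lambda *a, **kw: FailingExecutor())
    with pytest.raises(RuntimeError):
        run_pipline_job_task(cfg, job, session, user_id=1, user_name="u", user_token="t")
    # The except block must have committed FAILED before re-raising
    assert job.status == JOB_STATUS.FAILED.value
    assert job.date_finish is not None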