66from airflow .models import Variable
77from airflow .utils .state import State
88from airflow .hooks .http_hook import HttpHook
9+ from airflow .configuration import conf
910
1011
1112CONN_ID = "process_report"
@@ -32,7 +33,67 @@ def sign_with_jwt(data):
3233 return data
3334
3435
def get_error_category(context):
    """
    Return a human-readable error description for a failed dag_run.

    This function should be called only from the dag_run failure callback.
    It relies heavily on the log files, so the logging level in airflow.cfg
    shouldn't be lower than ERROR. Only the log of the latest task retry is
    inspected, because this function is called when the dag_run has already
    failed, so none of the previous retries brought any positive result.
    Logs are loaded only for the tasks that actually failed, not for
    upstream_failed ones.

    Error categories are sorted by priority from the highest to the lowest.
    Only one category (the first one found) is reported per failed task.
    Categories from all failed tasks are deduplicated and joined in priority
    order, so the returned message is deterministic. The placeholder of the
    "Failed to run workflow step" category is filled with the ids of all
    failed tasks.

    :param context: callback context; the "dag_run" and "dag" keys are read
    :return: always a string; "Unknown error. Contact support team" when no
             marker was found or the logs could not be read
    """

    # Insertion order defines the priority (highest first)
    ERROR_MARKERS = {
        # docker daemon is not running; network is unavailable to pull
        # the docker image or the image doesn't exist
        "Docker is not available for this tool": "Docker or Network problems. Contact support team",
        # something took too many resources and Airflow killed the process,
        # or something external has stopped the task
        "ERROR - Received SIGTERM. Terminating subprocesses": "Workflow was stopped. Restart with the lower threads or memory parameters",
        # cwltool exited with error when executing workflow step
        "Failed to run workflow step": "Workflow step(s) {} failed. Contact support team",
    }
    unknown_error = "Unknown error. Contact support team"

    dag_run = context["dag_run"]
    failed_tis = dag_run.get_task_instances(state=State.FAILED)

    # The handler configured as the task log reader gives access to task logs
    task_log_reader = conf.get("core", "task_log_reader")
    log_handler = next(
        (
            handler
            for handler in logging.getLogger("airflow.task").handlers
            if handler.name == task_log_reader
        ),
        None,
    )
    if log_handler is None:
        # Without a reader no log can be loaded, so no category can be found
        logging.debug("Failed to find the '%s' task log handler", task_log_reader)
        return unknown_error

    found_categories = set()  # use set to prevent duplicates

    for ti in failed_tis:
        # when retrieved from DagRun, the "task" property has to be set from the DAG
        ti.task = context["dag"].get_task(ti.task_id)
        try:  # in case log files were deleted or unavailable
            logs, _ = log_handler.read(ti)  # logs is always a list
            last_try_log = logs[-1]  # a string with \n from the last task retry
            category = next(
                (
                    category
                    for marker, category in ERROR_MARKERS.items()
                    if marker in last_try_log
                ),
                None,
            )
        except Exception as err:  # deliberate best effort: never break the callback
            logging.debug(
                "Failed to define the error category for task %s. \n %s",
                ti.task_id,
                err,
            )
            continue
        if category is not None:
            found_categories.add(category)

    if not found_categories:
        return unknown_error

    # Emit categories in priority order instead of arbitrary set order
    ordered_categories = [
        category
        for category in ERROR_MARKERS.values()
        if category in found_categories
    ]
    failed_task_ids = ", ".join(ti.task_id for ti in failed_tis)
    # format() only fills the placeholder of the "workflow step" category
    return ". ".join(ordered_categories).format(failed_task_ids)
87+
88+
3589def post_progress (context , from_task = None ):
90+ """
91+ If dag_run failed but this function was run from the task callback,
92+ error would be always "". The "error" is not "" only when this function
93+ will be called from the DAG callback, thus making it the last and the only
94+ message with the meaningful error description.
95+ """
96+
3697 from_task = False if from_task is None else from_task
3798 try :
3899 dag_run = context ["dag_run" ]
@@ -44,7 +105,7 @@ def post_progress(context, from_task=None):
44105 "dag_id" : dag_run .dag_id ,
45106 "run_id" : dag_run .run_id ,
46107 "progress" : int (len_tis_success / len_tis * 100 ),
47- "error" : context [ "reason" ] if dag_run .state == State .FAILED else ""
108+ "error" : get_error_category ( context ) if dag_run .state == State .FAILED and not from_task else ""
48109 }
49110 )
50111 http_hook .run (endpoint = ROUTES ["progress" ], json = {"payload" : data })
@@ -105,10 +166,12 @@ def task_on_success(context):
105166
106167
def task_on_failure(context):
    """
    Handle a task failure by delegating to post_status.

    Progress is not posted here, because it hasn't been changed.
    """
    post_status(context)
109171
110172
def task_on_retry(context):
    """
    Handle a task retry by delegating to post_status.

    Progress is not posted here, because it hasn't been changed.
    """
    post_status(context)
113176
114177
@@ -118,4 +181,5 @@ def dag_on_success(context):
118181
119182
def dag_on_failure(context):
    """
    Handle a DAG failure by delegating to post_progress.

    Progress has to be posted here, because the error is reported
    as a part of the progress message.
    """
    post_progress(context)
0 commit comments