
Commit 740a0e2

Report error categories (#50)

* Add yield_file_content helper function with a test
* Add get_error_category function to report the error category
* Prevent duplicate error messages for all failed workflow steps
* Report only the first error found per failed task. Error categories are sorted by priority: when a high-level error occurs, it may also make the lower-level error category markers appear in the log, so those need to be skipped.

1 parent dfcb2f5 commit 740a0e2

File tree

4 files changed: +98 -2 lines changed


cwl_airflow/utilities/cwl.py

Lines changed: 1 addition & 1 deletion

@@ -587,7 +587,7 @@ def execute_workflow_step(
     sys.stderr = _stderr

     if step_status != "success":
-        raise ValueError
+        raise ValueError("Failed to run workflow step")

     # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
     visit_class(step_outputs, ("File",), MutationManager().unset_generation)
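
The message given to the ValueError here is not cosmetic: when the step fails, the exception text ends up in the failed task's log, and get_error_category (added to report.py below) searches the log for exactly this string as one of its ERROR_MARKERS. A minimal sketch of that connection; the "ERROR - " prefix is an assumption about how Airflow formats the log line, not something taken from this commit:

    try:
        raise ValueError("Failed to run workflow step")
    except ValueError as err:
        log_line = f"ERROR - {err}"  # roughly how the message lands in the task log
        assert "Failed to run workflow step" in log_line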

cwl_airflow/utilities/helpers.py

Lines changed: 14 additions & 0 deletions

@@ -15,6 +15,20 @@
 from typing import MutableMapping, MutableSequence


+# is not actually used anywhere
+def yield_file_content(location):
+    """
+    Yields lines from a text file, with leading/trailing whitespace
+    (including the trailing \n) stripped. Lines that are empty or
+    contain only spaces/tabs are skipped.
+    """
+
+    with open(location, "r") as input_stream:
+        for line in input_stream:
+            if line.strip():
+                yield line.strip()
+
+
 def get_compressed(data_str, reset_position=None):
     """
     Converts character string "data_str" as "utf-8" into bytes ("utf-8"
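
A brief usage sketch for the new helper ("inputs.txt" is a hypothetical file path, not part of the commit): it yields each non-empty line of a text file with surrounding whitespace stripped, so callers never see blank or whitespace-only lines.

    from cwl_airflow.utilities.helpers import yield_file_content

    # "inputs.txt" is an assumed example file
    for line in yield_file_content("inputs.txt"):
        print(line)  # already stripped; blank and whitespace-only lines were skipped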

cwl_airflow/utilities/report.py

Lines changed: 65 additions & 1 deletion

@@ -6,6 +6,7 @@
 from airflow.models import Variable
 from airflow.utils.state import State
 from airflow.hooks.http_hook import HttpHook
+from airflow.configuration import conf


 CONN_ID = "process_report"
@@ -32,7 +33,67 @@ def sign_with_jwt(data):
     return data


+def get_error_category(context):
+    """
+    This function should be called only from the dag_run failure callback.
+    It relies heavily on the log files, so the logging level in airflow.cfg
+    shouldn't be lower than ERROR. We load the log file only for the latest
+    task retry, because get_error_category is called when the dag_run has
+    failed, so none of the previous task retries brought any positive
+    results. We load logs only for the actually failed tasks, not for the
+    upstream_failed ones. All error categories are sorted by priority from
+    the highest level to the lowest. We report only one (the highest, the
+    first found) error category per failed task. Error categories from all
+    failed tasks are combined and deduplicated. The "Failed to run workflow
+    step" category is additionally filled with the failed task ids. The
+    returned value is always a string.
+    """
+
+    ERROR_MARKERS = {
+        # the docker daemon is not running; the network is unavailable to
+        # pull the docker image, or the image doesn't exist
+        "Docker is not available for this tool": "Docker or Network problems. Contact support team",
+        # something took too many resources and Airflow killed the process,
+        # or the task was stopped externally
+        "ERROR - Received SIGTERM. Terminating subprocesses": "Workflow was stopped. Restart with the lower threads or memory parameters",
+        # cwltool exited with an error when executing a workflow step
+        "Failed to run workflow step": "Workflow step(s) {} failed. Contact support team"
+    }
+
+    dag_run = context["dag_run"]
+    failed_tis = dag_run.get_task_instances(state=State.FAILED)
+    log_handler = next(                                 # to get access to logs
+        (
+            h for h in logging.getLogger("airflow.task").handlers
+            if h.name == conf.get("core", "task_log_reader")
+        ),
+        None
+    )
+
+    categories = set()                                  # use a set to prevent duplicates
+
+    for ti in failed_tis:
+        ti.task = context["dag"].get_task(ti.task_id)   # for some reason, when retrieved from the DagRun, the "task" property needs to be set from the DAG
+        try:                                            # in case log files were deleted or are unavailable
+            logs, _ = log_handler.read(ti)              # logs is always a list
+            for marker, category in ERROR_MARKERS.items():
+                if marker in logs[-1]:                  # logs[-1] is a string with \n from the last task retry
+                    categories.add(category)
+                    break
+        except Exception as err:
+            logging.debug(f"Failed to define the error category for task {ti.task_id}. \n {err}")
+
+    if categories:
+        return ". ".join(categories).format(", ".join([ti.task_id for ti in failed_tis]))  # mainly to fill the placeholder with the failed task ids
+    return "Unknown error. Contact support team"
+
+
 def post_progress(context, from_task=None):
+    """
+    If the dag_run failed but this function was run from a task callback,
+    "error" will always be "". The "error" field is not "" only when this
+    function is called from the DAG callback, which makes it the last and
+    the only message with a meaningful error description.
+    """
+
     from_task = False if from_task is None else from_task
     try:
         dag_run = context["dag_run"]
@@ -44,7 +105,7 @@ def post_progress(context, from_task=None):
             "dag_id": dag_run.dag_id,
             "run_id": dag_run.run_id,
             "progress": int(len_tis_success / len_tis * 100),
-            "error": context["reason"] if dag_run.state == State.FAILED else ""
+            "error": get_error_category(context) if dag_run.state == State.FAILED and not from_task else ""
         }
     )
     http_hook.run(endpoint=ROUTES["progress"], json={"payload": data})
@@ -105,10 +166,12 @@ def task_on_success(context):


 def task_on_failure(context):
+    # no need to post progress as it hasn't changed
     post_status(context)


 def task_on_retry(context):
+    # no need to post progress as it hasn't changed
     post_status(context)


@@ -118,4 +181,5 @@ def dag_on_success(context):


 def dag_on_failure(context):
+    # we need to post progress, because we also report the error in it
     post_progress(context)
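
The final message assembly in get_error_category is easy to miss: str.format is called on the already-joined categories, so the "{}" placeholder from the "Workflow step(s) {} failed" category is filled with the comma-separated ids of all failed tasks, while categories without a placeholder pass through unchanged. A small sketch with hypothetical task ids:

    categories = {"Workflow step(s) {} failed. Contact support team"}
    failed_task_ids = ["align_reads", "sort_bam"]  # hypothetical ids
    message = ". ".join(categories).format(", ".join(failed_task_ids))
    # -> "Workflow step(s) align_reads, sort_bam failed. Contact support team"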

tests/test_helpers.py

Lines changed: 18 additions & 0 deletions

@@ -11,6 +11,7 @@

 from cwl_airflow.utilities.helpers import (
     CleanAirflowImport,
+    yield_file_content,
     load_yaml,
     remove_field_from_dict,
     get_compressed,
@@ -24,6 +25,23 @@
 tempfile.tempdir = "/private/tmp"


+@pytest.mark.parametrize(
+    "location, control_count",
+    [
+        (
+            path.join(DATA_FOLDER, "jobs", "bam-bedgraph-bigwig.json"),
+            11
+        )
+    ]
+)
+def test_yield_file_content(location, control_count):
+    count = 0
+    for _ in yield_file_content(location):
+        count += 1
+    assert control_count == count, \
+        "Failed to read the proper number of lines from the text file"
+
+
 @pytest.mark.parametrize(
     "location, control_md5sum",
     [
