[Monitoring] Adding a metric for task outcome (#4458)

vitorguidi · web-flow · commit 514cec033711 · 2024-12-11T10:44:56.000+11:00
### Motivation We currently have no metric that tracks the error rate for each task. This PR implements that, and the error rate can be obtained by summing up the metric with outcome=failure, divided by the overall sum. This is useful for SLI alerting. Part of #4271
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -27,6 +27,7 @@
 from clusterfuzz._internal.bot.webserver import http_server
 from clusterfuzz._internal.metrics import logs
 from clusterfuzz._internal.metrics import monitoring_metrics
+from clusterfuzz._internal.protos import uworker_msg_pb2
 from clusterfuzz._internal.system import environment
 
 # Define an alias to appease pylint.
@@ -74,12 +75,15 @@ class _MetricRecorder(contextlib.AbstractContextManager):
   Members:
     start_time_ns (int): The time at which this recorder was constructed, in
       nanoseconds since the Unix epoch.
+    utask_main_failure: this class stores the uworker_output.ErrorType 
+      object returned by utask_main, and uses it to emmit a metric.
   """
 
   def __init__(self, subtask: _Subtask):
     self.start_time_ns = time.time_ns()
     self._subtask = subtask
     self._labels = None
+    self.utask_main_failure = None
 
     if subtask == _Subtask.PREPROCESS:
       self._preprocess_start_time_ns = self.start_time_ns
@@ -138,6 +142,30 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
     monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add(
         e2e_duration_secs, self._labels)
 
+    # The only case where a task might fail without throwing, is in
+    # utask_main, by returning an ErrorType proto which indicates
+    # failure.
+    outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
+    monitoring_metrics.TASK_OUTCOME_COUNT.increment({
+        **self._labels, 'outcome': outcome
+    })
+    if outcome == "success":
+      error_condition = 'N/A'
+    elif _exc_type:
+      error_condition = 'UNHANDLED_EXCEPTION'
+    else:
+      error_condition = uworker_msg_pb2.ErrorType.Name(  # pylint: disable=no-member
+          self.utask_main_failure)
+    # Get rid of job as a label, so we can have another metric to make
+    # error conditions more explicit, respecting the 30k distinct
+    # labels limit recommended by gcp.
+    trimmed_labels = self._labels
+    del trimmed_labels['job']
+    trimmed_labels['outcome'] = outcome
+    trimmed_labels['error_condition'] = error_condition
+    monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
+        trimmed_labels)
+
 
 def ensure_uworker_env_type_safety(uworker_env):
   """Converts all values in |uworker_env| to str types.
@@ -226,6 +254,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input):
       return None
 
     # NOTE: Keep this in sync with `uworker_main()`.
+    if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
+      recorder.utask_main_failure = uworker_output.error_type
     uworker_output.bot_name = environment.get_value('BOT_NAME', '')
     uworker_output.platform_id = environment.get_platform_id()
 
@@ -306,6 +336,9 @@ def uworker_main(input_download_url) -> None:
     logs.info('Starting utask_main: %s.' % utask_module)
     uworker_output = utask_module.utask_main(uworker_input)
 
+    if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
+      recorder.utask_main_failure = uworker_output.error_type
+
     # NOTE: Keep this in sync with `uworker_main_no_io()`.
     uworker_output.bot_name = environment.get_value('BOT_NAME', '')
     uworker_output.platform_id = environment.get_platform_id()
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -241,6 +241,7 @@
         monitor.StringField('job'),
     ],
 )
+
 TASK_RATE_LIMIT_COUNT = monitor.CounterMetric(
     'task/rate_limit',
     description=('Counter for rate limit events.'),
@@ -250,6 +251,30 @@
         monitor.StringField('argument'),
     ])
 
+TASK_OUTCOME_COUNT = monitor.CounterMetric(
+    'task/outcome',
+    description=('Counter metric for task outcome (success/failure).'),
+    field_spec=[
+        monitor.StringField('task'),
+        monitor.StringField('job'),
+        monitor.StringField('subtask'),
+        monitor.StringField('mode'),
+        monitor.StringField('platform'),
+        monitor.StringField('outcome'),
+    ])
+
+TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
+    'task/outcome_by_error_type',
+    description=('Counter metric for task outcome, with error type.'),
+    field_spec=[
+        monitor.StringField('task'),
+        monitor.StringField('subtask'),
+        monitor.StringField('mode'),
+        monitor.StringField('platform'),
+        monitor.StringField('outcome'),
+        monitor.StringField('error_condition'),
+    ])
+
 UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric(
     'utask/subtask_e2e_duration_secs',
     description=(