Skip to content

Commit 514cec0

Browse files
authored
[Monitoring] Adding a metric for task outcome (#4458)
### Motivation

We currently have no metric that tracks the error rate for each task. This PR adds one: the error rate can be obtained by summing the metric with outcome=error and dividing by the overall sum. This is useful for SLI alerting.

Part of #4271
1 parent 591d6c5 commit 514cec0

File tree

2 files changed

+58
-0
lines changed

2 files changed

+58
-0
lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from clusterfuzz._internal.bot.webserver import http_server
2828
from clusterfuzz._internal.metrics import logs
2929
from clusterfuzz._internal.metrics import monitoring_metrics
30+
from clusterfuzz._internal.protos import uworker_msg_pb2
3031
from clusterfuzz._internal.system import environment
3132

3233
# Define an alias to appease pylint.
@@ -74,12 +75,15 @@ class _MetricRecorder(contextlib.AbstractContextManager):
7475
Members:
7576
start_time_ns (int): The time at which this recorder was constructed, in
7677
nanoseconds since the Unix epoch.
78+
utask_main_failure: this class stores the uworker_output.ErrorType
79+
object returned by utask_main, and uses it to emit a metric.
7780
"""
7881

7982
def __init__(self, subtask: _Subtask):
8083
self.start_time_ns = time.time_ns()
8184
self._subtask = subtask
8285
self._labels = None
86+
self.utask_main_failure = None
8387

8488
if subtask == _Subtask.PREPROCESS:
8589
self._preprocess_start_time_ns = self.start_time_ns
@@ -138,6 +142,30 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
138142
monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add(
139143
e2e_duration_secs, self._labels)
140144

145+
# The only case where a task might fail without throwing is in
146+
# utask_main, by returning an ErrorType proto which indicates
147+
# failure.
148+
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
149+
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
150+
**self._labels, 'outcome': outcome
151+
})
152+
if outcome == "success":
153+
error_condition = 'N/A'
154+
elif _exc_type:
155+
error_condition = 'UNHANDLED_EXCEPTION'
156+
else:
157+
error_condition = uworker_msg_pb2.ErrorType.Name( # pylint: disable=no-member
158+
self.utask_main_failure)
159+
# Get rid of job as a label, so we can have another metric to make
160+
# error conditions more explicit, respecting the 30k distinct
161+
# labels limit recommended by GCP.
162+
trimmed_labels = self._labels
163+
del trimmed_labels['job']
164+
trimmed_labels['outcome'] = outcome
165+
trimmed_labels['error_condition'] = error_condition
166+
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
167+
trimmed_labels)
168+
141169

142170
def ensure_uworker_env_type_safety(uworker_env):
143171
"""Converts all values in |uworker_env| to str types.
@@ -226,6 +254,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input):
226254
return None
227255

228256
# NOTE: Keep this in sync with `uworker_main()`.
257+
if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member
258+
recorder.utask_main_failure = uworker_output.error_type
229259
uworker_output.bot_name = environment.get_value('BOT_NAME', '')
230260
uworker_output.platform_id = environment.get_platform_id()
231261

@@ -306,6 +336,9 @@ def uworker_main(input_download_url) -> None:
306336
logs.info('Starting utask_main: %s.' % utask_module)
307337
uworker_output = utask_module.utask_main(uworker_input)
308338

339+
if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member
340+
recorder.utask_main_failure = uworker_output.error_type
341+
309342
# NOTE: Keep this in sync with `uworker_main_no_io()`.
310343
uworker_output.bot_name = environment.get_value('BOT_NAME', '')
311344
uworker_output.platform_id = environment.get_platform_id()

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@
241241
monitor.StringField('job'),
242242
],
243243
)
244+
244245
TASK_RATE_LIMIT_COUNT = monitor.CounterMetric(
245246
'task/rate_limit',
246247
description=('Counter for rate limit events.'),
@@ -250,6 +251,30 @@
250251
monitor.StringField('argument'),
251252
])
252253

254+
TASK_OUTCOME_COUNT = monitor.CounterMetric(
255+
'task/outcome',
256+
description=('Counter metric for task outcome (success/failure).'),
257+
field_spec=[
258+
monitor.StringField('task'),
259+
monitor.StringField('job'),
260+
monitor.StringField('subtask'),
261+
monitor.StringField('mode'),
262+
monitor.StringField('platform'),
263+
monitor.StringField('outcome'),
264+
])
265+
266+
TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
267+
'task/outcome_by_error_type',
268+
description=('Counter metric for task outcome, with error type.'),
269+
field_spec=[
270+
monitor.StringField('task'),
271+
monitor.StringField('subtask'),
272+
monitor.StringField('mode'),
273+
monitor.StringField('platform'),
274+
monitor.StringField('outcome'),
275+
monitor.StringField('error_condition'),
276+
])
277+
253278
UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric(
254279
'utask/subtask_e2e_duration_secs',
255280
description=(

0 commit comments

Comments
 (0)