Skip to content

Commit 764e8fe

Browse files
authored
[Monitoring] Partition UTask outcomes correctly into success and error (#4516)
### Motivation #4458 implemented an error rate for utasks, only considering exceptions. In #4499, outcomes were split between success, failure and maybe_retry conditions. There we learned that the volume of retryable outcomes is negligible, so it makes sense to count them as failures. Listing out all the success conditions under _MetricRecorder is not desirable. However, we are consciously taking on this technical debt so we can deliver #4271. A refactor of uworker main will be performed later, so that we can split the success and failure conditions, both of which are currently mixed in uworker_output.ErrorType. Reference for tech debt acknowledgement: #4517
1 parent db280ad commit 764e8fe

File tree

2 files changed

+24
-11
lines changed

2 files changed

+24
-11
lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,17 @@ def __init__(self, subtask: _Subtask):
8484
self._subtask = subtask
8585
self._labels = None
8686
self.utask_main_failure = None
87+
self._utask_success_conditions = [
88+
None, # This can be a successful return value in, ie, fuzz task
89+
uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member
90+
uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member
91+
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member
92+
uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member
93+
uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member
94+
uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member
95+
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member
96+
uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member
97+
]
8798

8899
if subtask == _Subtask.PREPROCESS:
89100
self._preprocess_start_time_ns = self.start_time_ns
@@ -125,6 +136,12 @@ def set_task_details(self,
125136
# Ensure we always have a value after this method returns.
126137
assert self._preprocess_start_time_ns is not None
127138

139+
def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool:
140+
"""Returns True if task succeeded, False otherwise."""
141+
if exc_type or uworker_error not in self._utask_success_conditions:
142+
return False
143+
return True
144+
128145
def __exit__(self, _exc_type, _exc_value, _traceback):
129146
# Ignore exception details, let Python continue unwinding the stack.
130147

@@ -145,11 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
145162
# The only case where a task might fail without throwing, is in
146163
# utask_main, by returning an ErrorType proto which indicates
147164
# failure.
148-
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
165+
task_succeeded = self._infer_uworker_main_outcome(_exc_type,
166+
self.utask_main_failure)
149167
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
150-
**self._labels, 'outcome': outcome
168+
**self._labels, 'task_succeeded': task_succeeded
151169
})
152-
if outcome == "success":
170+
if task_succeeded:
153171
error_condition = 'N/A'
154172
elif _exc_type:
155173
error_condition = 'UNHANDLED_EXCEPTION'
@@ -161,16 +179,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
161179
# labels limit recommended by gcp.
162180
trimmed_labels = self._labels
163181
del trimmed_labels['job']
164-
trimmed_labels['outcome'] = outcome
182+
trimmed_labels['task_succeeded'] = task_succeeded
165183
trimmed_labels['error_condition'] = error_condition
166184
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
167185
trimmed_labels)
168186

169-
if error_condition != 'UNHANDLED_EXCEPTION':
170-
task = self._labels['task']
171-
subtask = self._labels['subtask']
172-
logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')
173-
174187

175188
def ensure_uworker_env_type_safety(uworker_env):
176189
"""Converts all values in |uworker_env| to str types.

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@
263263
monitor.StringField('subtask'),
264264
monitor.StringField('mode'),
265265
monitor.StringField('platform'),
266-
monitor.StringField('outcome'),
266+
monitor.BooleanField('task_succeeded'),
267267
])
268268

269269
TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
@@ -274,7 +274,7 @@
274274
monitor.StringField('subtask'),
275275
monitor.StringField('mode'),
276276
monitor.StringField('platform'),
277-
monitor.StringField('outcome'),
277+
monitor.BooleanField('task_succeeded'),
278278
monitor.StringField('error_condition'),
279279
])
280280

0 commit comments

Comments
 (0)