Skip to content

Commit d3d1b76

Browse files
authored
[Monitoring] Partition uworker_output.ErrorType conditions into success, maybe_retry and failure outcomes (#4499)
### Motivation

#4458 implemented a task outcome metric, so we can track error rates in utasks, by job/task/subtask. As failures are expected for ClusterFuzz, initially only unhandled exceptions would be considered as actual errors.

Chrome folks asked for a better partitioning of error codes, which is implemented here as the following outcomes:

* success: the task has unequivocally succeeded, producing a sane result.
* maybe_retry: some transient error happened, and the task is potentially being retried. This might capture some unretriable failure condition, but it is a compromise we are willing to make in order to decrease false positives.
* failure: the task has unequivocally failed.

Part of #4271
1 parent 4e53eda commit d3d1b76

File tree

1 file changed

+58
-6
lines changed

1 file changed

+58
-6
lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,50 @@ def __init__(self, subtask: _Subtask):
8484
self._subtask = subtask
8585
self._labels = None
8686
self.utask_main_failure = None
87+
self._utask_success_conditions = [
88+
uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member
89+
uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member
90+
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member
91+
uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member
92+
uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member
93+
uworker_msg_pb2.ErrorType.MINIMIZE_UNREPRODUCIBLE_CRASH, # pylint: disable=no-member
94+
uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member
95+
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member
96+
uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member
97+
]
98+
self._utask_maybe_retry_conditions = [
99+
uworker_msg_pb2.ErrorType.ANALYZE_BUILD_SETUP, # pylint: disable=no-member
100+
uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISIONS_LIST, # pylint: disable=no-member
101+
uworker_msg_pb2.ErrorType.TESTCASE_SETUP, # pylint: disable=no-member
102+
uworker_msg_pb2.ErrorType.MINIMIZE_SETUP, # pylint: disable=no-member
103+
uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE, # pylint: disable=no-member
104+
uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZ_TARGET_SELECTED, # pylint: disable=no-member
105+
uworker_msg_pb2.ErrorType.PROGRESSION_NO_CRASH, # pylint: disable=no-member
106+
uworker_msg_pb2.ErrorType.PROGRESSION_TIMEOUT, # pylint: disable=no-member
107+
uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member
108+
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member
109+
uworker_msg_pb2.ErrorType.REGRESSION_TIMEOUT_ERROR, # pylint: disable=no-member
110+
uworker_msg_pb2.ErrorType.SYMBOLIZE_BUILD_SETUP_ERROR, # pylint: disable=no-member
111+
uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED, # pylint: disable=no-member
112+
uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED_IN_MAIN_FILE_PHASE, # pylint: disable=no-member
113+
]
114+
self._utask_failure_conditions = [
115+
uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISION_INDEX, # pylint: disable=no-member
116+
uworker_msg_pb2.ErrorType.UNHANDLED, # pylint: disable=no-member
117+
uworker_msg_pb2.ErrorType.VARIANT_BUILD_SETUP, # pylint: disable=no-member
118+
uworker_msg_pb2.ErrorType.FUZZ_BUILD_SETUP_FAILURE, # pylint: disable=no-member
119+
uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZER, # pylint: disable=no-member
120+
uworker_msg_pb2.ErrorType.PROGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member
121+
uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member
122+
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_BUILD, # pylint: disable=no-member
123+
uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member
124+
uworker_msg_pb2.ErrorType.REGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member
125+
uworker_msg_pb2.ErrorType.REGRESSION_BAD_BUILD_ERROR, # pylint: disable=no-member
126+
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_FAILED, # pylint: disable=no-member
127+
uworker_msg_pb2.ErrorType.CORPUS_PRUNING_FUZZER_SETUP_FAILED, # pylint: disable=no-member
128+
uworker_msg_pb2.ErrorType.CORPUS_PRUNING_ERROR, # pylint: disable=no-member
129+
uworker_msg_pb2.ErrorType.FUZZ_BAD_BUILD, # pylint: disable=no-member
130+
]
87131

88132
if subtask == _Subtask.PREPROCESS:
89133
self._preprocess_start_time_ns = self.start_time_ns
@@ -125,6 +169,18 @@ def set_task_details(self,
125169
# Ensure we always have a value after this method returns.
126170
assert self._preprocess_start_time_ns is not None
127171

172+
def _infer_uworker_main_outcome(self, exc_type, uworker_error):
173+
'''Infers, on a best effort basis, whether an uworker output implies
174+
success or failure. If an unequivocal response is not possible,
175+
classifies as maybe_retry.'''
176+
if exc_type or uworker_error in self._utask_failure_conditions:
177+
outcome = 'error'
178+
elif uworker_error in self._utask_maybe_retry_conditions:
179+
outcome = 'maybe_retry'
180+
else:
181+
outcome = 'success'
182+
return outcome
183+
128184
def __exit__(self, _exc_type, _exc_value, _traceback):
129185
# Ignore exception details, let Python continue unwinding the stack.
130186

@@ -145,7 +201,8 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
145201
# The only case where a task might fail without throwing, is in
146202
# utask_main, by returning an ErrorType proto which indicates
147203
# failure.
148-
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
204+
outcome = self._infer_uworker_main_outcome(_exc_type,
205+
self.utask_main_failure)
149206
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
150207
**self._labels, 'outcome': outcome
151208
})
@@ -166,11 +223,6 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
166223
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
167224
trimmed_labels)
168225

169-
if error_condition != 'UNHANDLED_EXCEPTION':
170-
task = self._labels['task']
171-
subtask = self._labels['subtask']
172-
logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')
173-
174226

175227
def ensure_uworker_env_type_safety(uworker_env):
176228
"""Converts all values in |uworker_env| to str types.

0 commit comments

Comments
 (0)