Skip to content

Commit 3daf2eb

Browse files
authored
[Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to track testcases stuck in analyze (#4547)
### Motivation We currently have no way to tell if analyze task was successfully executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would only track duration for tasks that did finish. An analyze_pending field is added to the Testcase entity in datastore, which is set to False by default, to True for manually uploaded testcases, and to False once analyze task postprocess runs. It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a status label, so we can know at what step the testcase is stuck, thus allowing us to alert if analyze is taking longer to finish than expected. The alert itself could be, for instance, P50 age of untriaged testcase (status=analyze_pending) > 3h. Also, this retroactively addresses comments from #4481: * Fixes docstring for emit_testcase_triage_duration_metric * Removes assertions * Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION, since it now accounts for fuzzer generated testcases * Use a boolean "from_fuzzer" field, instead of "origin" string, in TESTCASE_TRIAGE_DURATION
1 parent 764e8fe commit 3daf2eb

File tree

6 files changed

+49
-55
lines changed

6 files changed

+49
-55
lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,8 @@ def _update_testcase(output):
552552
if analyze_task_output.platform_id:
553553
testcase.platform_id = analyze_task_output.platform_id
554554

555+
testcase.analyze_pending = False
556+
555557
testcase.put()
556558

557559

src/clusterfuzz/_internal/common/testcase_utils.py

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -31,59 +31,39 @@
3131

3232

3333
def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
34-
'''Finds out if a testcase is fuzzer generated or manually uploaded,
35-
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
36-
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
37-
if not testcase_upload_metadata:
38-
logs.warning(f'No upload metadata found for testcase {testcase_id},'
39-
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
40-
return
41-
if not testcase_upload_metadata.timestamp:
42-
logs.warning(
43-
f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
44-
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
45-
return
46-
assert step in [
47-
'analyze_launched', 'analyze_completed', 'minimize_completed',
48-
'regression_completed', 'impact_completed', 'issue_updated'
49-
]
50-
34+
"""Finds out if a testcase is fuzzer generated or manually uploaded,
35+
and emits the TESTCASE_TRIAGE_DURATION metric."""
5136
testcase = data_handler.get_testcase_by_id(testcase_id)
5237

5338
if not testcase:
5439
logs.warning(f'No testcase found with id {testcase_id},'
55-
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
40+
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
5641
return
5742

5843
if not testcase.job_type:
5944
logs.warning(f'No job_type associated to testcase {testcase_id},'
60-
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
45+
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
6146
return
6247

6348
from_fuzzer = not get_testcase_upload_metadata(testcase_id)
6449

65-
assert step in [
66-
'analyze_launched', 'analyze_completed', 'minimize_completed',
67-
'regression_completed', 'impact_completed', 'issue_updated'
68-
]
69-
7050
if not testcase.get_age_in_seconds():
7151
logs.warning(f'No timestamp associated to testcase {testcase_id},'
72-
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
52+
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
7353
return
7454

75-
testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
55+
testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)
7656

77-
logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
57+
logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
7858
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
79-
'in step {step}.')
59+
f'in step {step}, from_fuzzer: {from_fuzzer}.')
8060

81-
monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
61+
monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
8262
testcase_age_in_hours,
8363
labels={
8464
'job': testcase.job_type,
8565
'step': step,
86-
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
66+
'from_fuzzer': from_fuzzer
8767
})
8868

8969

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):
256256

257257
def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
258258
testcase_age = testcase.get_age_in_seconds()
259+
if not testcase_age:
260+
return
259261
monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
260262
testcase_age,
261263
labels={
@@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
336338
})
337339

338340

339-
def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
341+
PENDING_ANALYZE = 'pending_analyze'
342+
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
343+
PENDING_PROGRESSION = 'pending_progression'
344+
PENDING_GROUPING = 'pending_grouping'
345+
PENDING_FILING = 'pending_filing'
346+
347+
348+
def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
349+
step: str):
340350
"""Emmits a metric to track age of untriaged testcases."""
341-
if not testcase.timestamp:
351+
if not testcase.get_age_in_seconds():
342352
return
343353

344354
logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
345-
f'(age = {testcase.get_age_in_seconds()})')
355+
f'(age = {testcase.get_age_in_seconds()}), step = {step}')
346356
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
347357
testcase.get_age_in_seconds() / 3600,
348358
labels={
349359
'job': testcase.job_type,
350360
'platform': testcase.platform,
361+
'step': step,
351362
})
352363

353364

354-
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
355-
PENDING_PROGRESSION = 'pending_progression'
356-
PENDING_GROUPING = 'pending_grouping'
357-
PENDING_FILING = 'pending_filing'
358-
359-
360365
def main():
361366
"""Files bugs."""
362367
try:
@@ -409,7 +414,7 @@ def main():
409414
if testcase.get_metadata('progression_pending'):
410415
_set_testcase_stuck_state(testcase, True)
411416
logs.info(f'Skipping testcase {testcase_id}, progression pending')
412-
_emit_untriaged_testcase_age_metric(testcase)
417+
_emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
413418
_increment_untriaged_testcase_count(testcase.job_type,
414419
PENDING_PROGRESSION)
415420
continue
@@ -432,10 +437,12 @@ def main():
432437
# Require that all tasks like minimizaton, regression testing, etc have
433438
# finished.
434439
if not critical_tasks_completed:
435-
_emit_untriaged_testcase_age_metric(testcase)
440+
status = PENDING_CRITICAL_TASKS
441+
if testcase.analyze_pending:
442+
status = PENDING_ANALYZE
443+
_emit_untriaged_testcase_age_metric(testcase, status)
436444
_set_testcase_stuck_state(testcase, True)
437-
_increment_untriaged_testcase_count(testcase.job_type,
438-
PENDING_CRITICAL_TASKS)
445+
_increment_untriaged_testcase_count(testcase.job_type, status)
439446
logs.info(
440447
f'Skipping testcase {testcase_id}, critical tasks still pending.')
441448
continue
@@ -452,15 +459,15 @@ def main():
452459
# metadata works well.
453460
if not testcase.group_id and not dates.time_has_expired(
454461
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
455-
_emit_untriaged_testcase_age_metric(testcase)
462+
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
456463
_set_testcase_stuck_state(testcase, True)
457464
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
458465
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
459466
continue
460467

461468
if not testcase.get_metadata('ran_grouper'):
462469
# Testcase should be considered by the grouper first before filing.
463-
_emit_untriaged_testcase_age_metric(testcase)
470+
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
464471
_set_testcase_stuck_state(testcase, True)
465472
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
466473
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -490,16 +497,15 @@ def main():
490497
# Clean up old triage messages that would be not applicable now.
491498
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
492499

493-
# A testcase is untriaged, until immediately before a bug is opened
494-
_emit_untriaged_testcase_age_metric(testcase)
495-
_set_testcase_stuck_state(testcase, False)
496-
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
497-
498500
# File the bug first and then create filed bug metadata.
499501
if not _file_issue(testcase, issue_tracker, throttler):
502+
_emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
503+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
500504
logs.info(f'Issue filing failed for testcase id {testcase_id}')
501505
continue
502506

507+
_set_testcase_stuck_state(testcase, False)
508+
503509
_create_filed_bug_metadata(testcase)
504510
issue_filer.notify_issue_update(testcase, 'new')
505511

src/clusterfuzz/_internal/datastore/data_handler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
921921
return testcase.minimized_keys and testcase.regression
922922

923923
return bool(testcase.minimized_keys and testcase.regression and
924-
testcase.is_impact_set_flag)
924+
testcase.is_impact_set_flag and not testcase.analyze_pending)
925925

926926

927927
# ------------------------------------------------------------------------------
@@ -1380,6 +1380,7 @@ def create_user_uploaded_testcase(key,
13801380

13811381
testcase.timestamp = utils.utcnow()
13821382
testcase.created = testcase.timestamp
1383+
testcase.analyze_pending = True
13831384
testcase.uploader_email = uploader_email
13841385
testcase.put()
13851386

src/clusterfuzz/_internal/datastore/data_types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,10 @@ class Testcase(Model):
583583
# Tracks if a testcase is stuck during triage.
584584
stuck_in_triage = ndb.BooleanProperty(default=False)
585585

586+
# Tracks if analyze task is pending.
587+
# Defaults to false, since most testcases are fuzzer produced.
588+
analyze_pending = ndb.BooleanProperty(default=False)
589+
586590
def is_chromium(self):
587591
return self.project_name in ('chromium', 'chromium-testing')
588592

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,17 +231,17 @@
231231
],
232232
)
233233

234-
TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
235-
'uploaded_testcase_analysis/triage_duration_secs',
234+
TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
235+
'testcase_analysis/triage_duration_hours',
236236
description=('Time elapsed between testcase upload and completion'
237-
' of relevant tasks in the testcase upload lifecycle.'
237+
' of relevant tasks in the testcase lifecycle.'
238238
' Origin can be either from a fuzzer, or a manual'
239239
' upload. Measured in hours.'),
240240
bucketer=monitor.GeometricBucketer(),
241241
field_spec=[
242242
monitor.StringField('step'),
243243
monitor.StringField('job'),
244-
monitor.StringField('origin'),
244+
monitor.BooleanField('from_fuzzer'),
245245
],
246246
)
247247

@@ -365,6 +365,7 @@
365365
field_spec=[
366366
monitor.StringField('job'),
367367
monitor.StringField('platform'),
368+
monitor.StringField('step'),
368369
])
369370

370371
UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(

0 commit comments

Comments
 (0)