Skip to content

Commit 8da25b5

Browse files
authored
[Monitoring] Extend TESTCASE_UPLOAD_TRIAGE_DURATION to account for fuzzer generated test cases (#4481)
### Motivation [#4364](#4364) implemented the tracking for the time it takes clusterfuzz to complete several steps of the manually uploaded testcase lifecycle. As per Chrome's request, the metric will now contain an 'origin' label, which indicates if the testcase was 'manually_uploaded' or generated by a 'fuzzer'. The code was also simplified, by reusing the get_age_in_seconds method from the TestCase entity. Also, it adds the 'stuck_in_triage' boolean field to the testcase entity, to facilitate figuring out what testcases are in a stuck state, so follow up actions can be taken. Part of #4271
1 parent d3d1b76 commit 8da25b5

File tree

4 files changed

+47
-8
lines changed

4 files changed

+47
-8
lines changed

src/clusterfuzz/_internal/common/testcase_utils.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131

3232

3333
def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
34+
'''Finds out if a testcase is fuzzer generated or manually uploaded,
35+
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
3436
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
3537
if not testcase_upload_metadata:
3638
logs.warning(f'No upload metadata found for testcase {testcase_id},'
@@ -45,9 +47,6 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
4547
'analyze_launched', 'analyze_completed', 'minimize_completed',
4648
'regression_completed', 'impact_completed', 'issue_updated'
4749
]
48-
elapsed_time_since_upload = datetime.datetime.utcnow()
49-
elapsed_time_since_upload -= testcase_upload_metadata.timestamp
50-
elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() / 3600
5150

5251
testcase = data_handler.get_testcase_by_id(testcase_id)
5352

@@ -61,15 +60,30 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
6160
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
6261
return
6362

63+
from_fuzzer = not get_testcase_upload_metadata(testcase_id)
64+
65+
assert step in [
66+
'analyze_launched', 'analyze_completed', 'minimize_completed',
67+
'regression_completed', 'impact_completed', 'issue_updated'
68+
]
69+
70+
if not testcase.get_age_in_seconds():
71+
logs.warning(f'No timestamp associated to testcase {testcase_id},'
72+
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
73+
return
74+
75+
testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
76+
6477
logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
65-
f'{testcase_id} (age = {elapsed_time_since_upload}) '
78+
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
6679
'in step {step}.')
6780

6881
monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
69-
elapsed_time_since_upload,
82+
testcase_age_in_hours,
7083
labels={
7184
'job': testcase.job_type,
7285
'step': step,
86+
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
7387
})
7488

7589

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,13 @@ def _file_issue(testcase, issue_tracker, throttler):
309309
return filed
310310

311311

312+
def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool):
313+
if testcase.stuck_in_triage == state:
314+
return
315+
testcase.stuck_in_triage = state
316+
testcase.put()
317+
318+
312319
untriaged_testcases = {}
313320

314321

@@ -322,7 +329,8 @@ def _increment_untriaged_testcase_count(job, status):
322329
def _emit_untriaged_testcase_count_metric():
323330
for (job, status) in untriaged_testcases:
324331
monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
325-
untriaged_testcases[(job, status)], labels={
332+
untriaged_testcases[(job, status)],
333+
labels={
326334
'job': job,
327335
'status': status,
328336
})
@@ -385,18 +393,21 @@ def main():
385393

386394
# Skip if testcase's job is removed.
387395
if testcase.job_type not in all_jobs:
396+
_set_testcase_stuck_state(testcase, False)
388397
logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
389398
f' ({testcase.job_type})')
390399
continue
391400

392401
# Skip if testcase's job is in exclusions list.
393402
if testcase.job_type in excluded_jobs:
403+
_set_testcase_stuck_state(testcase, False)
394404
logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
395405
f' exclusion list ({testcase.job_type})')
396406
continue
397407

398408
# Skip if we are running progression task at this time.
399409
if testcase.get_metadata('progression_pending'):
410+
_set_testcase_stuck_state(testcase, True)
400411
logs.info(f'Skipping testcase {testcase_id}, progression pending')
401412
_emit_untriaged_testcase_age_metric(testcase)
402413
_increment_untriaged_testcase_count(testcase.job_type,
@@ -405,13 +416,15 @@ def main():
405416

406417
# If the testcase has a bug filed already, no triage is needed.
407418
if _is_bug_filed(testcase):
419+
_set_testcase_stuck_state(testcase, False)
408420
logs.info(
409421
f'Skipping testcase {testcase_id}, since a bug was already filed.')
410422
continue
411423

412424
# Check if the crash is important, i.e. it is either a reproducible crash
413425
# or an unreproducible crash happening frequently.
414426
if not _is_crash_important(testcase):
427+
_set_testcase_stuck_state(testcase, False)
415428
logs.info(
416429
f'Skipping testcase {testcase_id}, since the crash is not important.')
417430
continue
@@ -420,6 +433,7 @@ def main():
420433
# finished.
421434
if not critical_tasks_completed:
422435
_emit_untriaged_testcase_age_metric(testcase)
436+
_set_testcase_stuck_state(testcase, True)
423437
_increment_untriaged_testcase_count(testcase.job_type,
424438
PENDING_CRITICAL_TASKS)
425439
logs.info(
@@ -439,13 +453,15 @@ def main():
439453
if not testcase.group_id and not dates.time_has_expired(
440454
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
441455
_emit_untriaged_testcase_age_metric(testcase)
456+
_set_testcase_stuck_state(testcase, True)
442457
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
443458
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
444459
continue
445460

446461
if not testcase.get_metadata('ran_grouper'):
447462
# Testcase should be considered by the grouper first before filing.
448463
_emit_untriaged_testcase_age_metric(testcase)
464+
_set_testcase_stuck_state(testcase, True)
449465
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
450466
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
451467
continue
@@ -466,6 +482,7 @@ def main():
466482
# If there are similar issues to this test case already filed or recently
467483
# closed, skip filing a duplicate bug.
468484
if _check_and_update_similar_bug(testcase, issue_tracker):
485+
_set_testcase_stuck_state(testcase, False)
469486
logs.info(f'Skipping testcase {testcase_id}, since a similar bug'
470487
' was already filed.')
471488
continue
@@ -475,6 +492,7 @@ def main():
475492

476493
# A testcase is untriaged, until immediately before a bug is opened
477494
_emit_untriaged_testcase_age_metric(testcase)
495+
_set_testcase_stuck_state(testcase, False)
478496
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
479497

480498
# File the bug first and then create filed bug metadata.

src/clusterfuzz/_internal/datastore/data_types.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,9 @@ class Testcase(Model):
580580
# corpus.
581581
trusted = ndb.BooleanProperty(default=False)
582582

583+
# Tracks if a testcase is stuck during triage.
584+
stuck_in_triage = ndb.BooleanProperty(default=False)
585+
583586
def is_chromium(self):
584587
return self.project_name in ('chromium', 'chromium-testing')
585588

@@ -686,6 +689,8 @@ def get_created_time(self) -> ndb.DateTimeProperty:
686689

687690
def get_age_in_seconds(self):
688691
current_time = datetime.datetime.utcnow()
692+
if not self.get_created_time():
693+
return None
689694
testcase_age = current_time - self.get_created_time()
690695
return testcase_age.total_seconds()
691696

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,12 +234,14 @@
234234
TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
235235
'uploaded_testcase_analysis/triage_duration_secs',
236236
description=('Time elapsed between testcase upload and completion'
237-
' of relevant tasks in the testcase upload lifecycle, '
238-
'in hours.'),
237+
' of relevant tasks in the testcase upload lifecycle.'
238+
' Origin can be either from a fuzzer, or a manual'
239+
' upload. Measured in hours.'),
239240
bucketer=monitor.GeometricBucketer(),
240241
field_spec=[
241242
monitor.StringField('step'),
242243
monitor.StringField('job'),
244+
monitor.StringField('origin'),
243245
],
244246
)
245247

0 commit comments

Comments
 (0)