Skip to content

Commit 1208d14

Browse files
authored
[Monitoring] Fix untriaged testcase age oversampling, add untriaged testcase count (#4494)
### Motivation

#4364 implemented a metric to track the percentile distributions of untriaged testcase age. It was overcounting testcases:

* Testcases for which a bug was already filed
* Testcases for which the crash was unimportant

This PR solves this issue and adds the UNTRIAGED_TESTCASE_COUNT metric, drilled down by job and platform, so we can also know how many testcases are stuck, and not only their age distribution.
1 parent 187b18d commit 1208d14

File tree

2 files changed

+39
-7
lines changed

2 files changed

+39
-7
lines changed

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,22 @@ def _file_issue(testcase, issue_tracker, throttler):
309309
return filed
310310

311311

312-
def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
313-
testcase: data_types.Testcase):
312+
# Module-level tally of untriaged testcases, keyed by a
# (job_type, platform) tuple and holding an integer count per key.
untriaged_testcase_count = {}
313+
314+
315+
def _increment_untriaged_testcase_count(testcase: data_types.Testcase):
  """Adds one to the untriaged tally for the testcase's job and platform.

  The tally lives in the module-level `untriaged_testcase_count` dict and is
  keyed by the `(testcase.job_type, testcase.platform)` tuple. A pair seen for
  the first time starts from zero.
  """
  key = (testcase.job_type, testcase.platform)
  untriaged_testcase_count[key] = untriaged_testcase_count.get(key, 0) + 1
320+
321+
322+
def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
314323
"""Emits a metric to track age of untriaged testcases."""
315-
if critical_tasks_completed:
316-
return
317324
if not testcase.timestamp:
318325
return
319326

327+
_increment_untriaged_testcase_count(testcase)
320328
logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
321329
f'(age = {testcase.get_age_in_seconds()})')
322330
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
@@ -327,6 +335,16 @@ def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
327335
})
328336

329337

338+
def _emit_untriaged_testcase_count_metric():
  """Publishes UNTRIAGED_TESTCASE_COUNT for every tallied (job, platform).

  Each entry of the module-level `untriaged_testcase_count` dict is written to
  the gauge with its `job` and `platform` labels. The tally is then cleared so
  that a later run in the same process starts from zero rather than adding to
  this run's totals and publishing inflated values.
  """
  for (job, platform), count in untriaged_testcase_count.items():
    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
        count, labels={
            'job': job,
            'platform': platform,
        })

  # The accumulator is module-level state; without this reset, counts from
  # one triage pass would leak into the next pass run by the same process.
  untriaged_testcase_count.clear()
346+
347+
330348
def main():
331349
"""Files bugs."""
332350
try:
@@ -373,12 +391,10 @@ def main():
373391
f' exclusion list ({testcase.job_type})')
374392
continue
375393

376-
# Emmit the metric for testcases that should be triaged.
377-
_emit_untriaged_testcase_age_metric(critical_tasks_completed, testcase)
378-
379394
# Skip if we are running progression task at this time.
380395
if testcase.get_metadata('progression_pending'):
381396
logs.info(f'Skipping testcase {testcase_id}, progression pending')
397+
_emit_untriaged_testcase_age_metric(testcase)
382398
continue
383399

384400
# If the testcase has a bug filed already, no triage is needed.
@@ -397,6 +413,7 @@ def main():
397413
# Require that all tasks like minimization, regression testing, etc have
398414
# finished.
399415
if not critical_tasks_completed:
416+
_emit_untriaged_testcase_age_metric(testcase)
400417
logs.info(
401418
f'Skipping testcase {testcase_id}, critical tasks still pending.')
402419
continue
@@ -413,11 +430,13 @@ def main():
413430
# metadata works well.
414431
if not testcase.group_id and not dates.time_has_expired(
415432
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
433+
_emit_untriaged_testcase_age_metric(testcase)
416434
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
417435
continue
418436

419437
if not testcase.get_metadata('ran_grouper'):
420438
# Testcase should be considered by the grouper first before filing.
439+
_emit_untriaged_testcase_age_metric(testcase)
421440
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
422441
continue
423442

@@ -447,6 +466,7 @@ def main():
447466
# File the bug first and then create filed bug metadata.
448467
if not _file_issue(testcase, issue_tracker, throttler):
449468
logs.info(f'Issue filing failed for testcase id {testcase_id}')
469+
_emit_untriaged_testcase_age_metric(testcase)
450470
continue
451471

452472
_create_filed_bug_metadata(testcase)
@@ -455,6 +475,8 @@ def main():
455475
logs.info('Filed new issue %s for testcase %d.' % (testcase.bug_information,
456476
testcase_id))
457477

478+
_emit_untriaged_testcase_count_metric()
479+
458480
logs.info('Triage testcases succeeded.')
459481
return True
460482

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,16 @@
365365
monitor.StringField('platform'),
366366
])
367367

368+
# Gauge holding how many testcases are still untriaged, labelled by job and
# platform. The value is a plain count, so the description carries no unit.
UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(
    'issues/untriaged_testcase_count',
    description='Number of testcases that were not yet triaged '
    '(have not yet completed analyze, regression,'
    ' minimization, impact task).',
    field_spec=[
        monitor.StringField('job'),
        monitor.StringField('platform'),
    ])
377+
368378
ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(
369379
'task/analyze/reproducibility',
370380
description='Outcome count for analyze task.',

0 commit comments

Comments
 (0)