Skip to content

Commit 2f37515

Browse files
authored
[Monitoring] Enable job and status granularity in UNTRIAGED_TESTCASE_COUNT (#4500)
### Motivation The UNTRIAGED_TESTCASE_COUNT metric worked as expected; however, it was hard to figure out which job a testcase belongs to and why the testcase is stuck. This PR adds a job label and a status label, where the status can be one of four values: PENDING_CRITICAL_TASKS = 'pending_critical_tasks', PENDING_PROGRESSION = 'pending_progression', PENDING_GROUPING = 'pending_grouping', PENDING_FILING = 'pending_filing'. Since (job, status) tuples might disappear from one triage run to the next, the period of time the gauge is valid for should be, at most, the interval at which the triage cronjob runs; otherwise testcases might be overcounted. Part of #4271
1 parent 667338c commit 2f37515

File tree

2 files changed

+37
-11
lines changed

2 files changed

+37
-11
lines changed

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,25 @@ def _file_issue(testcase, issue_tracker, throttler):
309309
return filed
310310

311311

312+
# Tally of untriaged testcases, keyed by (job, status) tuple. Filled in by
# _increment_untriaged_testcase_count and read back when the
# UNTRIAGED_TESTCASE_COUNT metric is emitted.
untriaged_testcases = {}
313+
314+
315+
def _increment_untriaged_testcase_count(job, status):
  """Adds one to the untriaged testcase tally for the given job and status.

  Args:
    job: Job the testcase belongs to; used as the 'job' metric label.
    status: Reason the testcase is still untriaged; used as the 'status'
      metric label.
  """
  key = (job, status)
  # A pair seen for the first time starts from zero.
  untriaged_testcases[key] = untriaged_testcases.get(key, 0) + 1
320+
321+
322+
def _emit_untriaged_testcase_count_metric():
  """Sets UNTRIAGED_TESTCASE_COUNT once per recorded (job, status) pair.

  Each gauge value is the number of testcases tallied for that pair in
  untriaged_testcases, labelled with the pair's job and status.
  """
  for (job, status), count in untriaged_testcases.items():
    # Pass the count for this pair only, not the whole tally dict: the gauge
    # needs a numeric value for each label set.
    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
        count, labels={
            'job': job,
            'status': status,
        })
329+
330+
312331
def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
313332
"""Emmits a metric to track age of untriaged testcases."""
314333
if not testcase.timestamp:
@@ -324,6 +343,12 @@ def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
324343
})
325344

326345

346+
# Values for the 'status' label of UNTRIAGED_TESTCASE_COUNT. Each one names the
# triage step a testcase is still waiting on.
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
347+
PENDING_PROGRESSION = 'pending_progression'
348+
PENDING_GROUPING = 'pending_grouping'
349+
PENDING_FILING = 'pending_filing'
350+
351+
327352
def main():
328353
"""Files bugs."""
329354
try:
@@ -346,8 +371,6 @@ def main():
346371

347372
throttler = Throttler()
348373

349-
untriaged_testcases = 0
350-
351374
for testcase_id in data_handler.get_open_testcase_id_iterator():
352375
logs.info(f'Triaging {testcase_id}')
353376
try:
@@ -376,7 +399,8 @@ def main():
376399
if testcase.get_metadata('progression_pending'):
377400
logs.info(f'Skipping testcase {testcase_id}, progression pending')
378401
_emit_untriaged_testcase_age_metric(testcase)
379-
untriaged_testcases += 1
402+
_increment_untriaged_testcase_count(testcase.job_type,
403+
PENDING_PROGRESSION)
380404
continue
381405

382406
# If the testcase has a bug filed already, no triage is needed.
@@ -396,7 +420,8 @@ def main():
396420
# finished.
397421
if not critical_tasks_completed:
398422
_emit_untriaged_testcase_age_metric(testcase)
399-
untriaged_testcases += 1
423+
_increment_untriaged_testcase_count(testcase.job_type,
424+
PENDING_CRITICAL_TASKS)
400425
logs.info(
401426
f'Skipping testcase {testcase_id}, critical tasks still pending.')
402427
continue
@@ -414,14 +439,14 @@ def main():
414439
if not testcase.group_id and not dates.time_has_expired(
415440
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
416441
_emit_untriaged_testcase_age_metric(testcase)
417-
untriaged_testcases += 1
442+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
418443
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
419444
continue
420445

421446
if not testcase.get_metadata('ran_grouper'):
422447
# Testcase should be considered by the grouper first before filing.
423448
_emit_untriaged_testcase_age_metric(testcase)
424-
untriaged_testcases += 1
449+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
425450
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
426451
continue
427452

@@ -450,7 +475,7 @@ def main():
450475

451476
# A testcase is untriaged, until immediately before a bug is opened
452477
_emit_untriaged_testcase_age_metric(testcase)
453-
untriaged_testcases += 1
478+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
454479

455480
# File the bug first and then create filed bug metadata.
456481
if not _file_issue(testcase, issue_tracker, throttler):
@@ -463,9 +488,7 @@ def main():
463488
logs.info('Filed new issue %s for testcase %d.' % (testcase.bug_information,
464489
testcase_id))
465490

466-
monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
467-
untriaged_testcases, labels={})
468-
491+
_emit_untriaged_testcase_count_metric()
469492
logs.info('Triage testcases succeeded.')
470493
return True
471494

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,10 @@
370370
description='Number of testcases that were not yet triaged '
371371
'(have not yet completed analyze, regression,'
372372
' minimization, impact task), in hours.',
373-
field_spec=[],
373+
field_spec=[
374+
monitor.StringField('job'),
375+
monitor.StringField('status'),
376+
],
374377
)
375378

376379
ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(

0 commit comments

Comments
 (0)