Skip to content

Commit ba9009a

Browse files
authored
[Monitoring] Collecting a metric for the age of untriaged testcases (#4381)
### Motivation

Once a testcase is generated (or manually uploaded), follow-up tasks (analyze/progression) are started. This happens by publishing to a pubsub queue, both for the manually uploaded case and for the fuzzer-generated case. If for any reason the messages are not processed, the testcase gets stuck. To get better visibility into these stuck testcases, the UNTRIAGED_TESTCASE_AGE metric is introduced. It pinpoints the age of testcases that have not yet been triaged (more precisely, that have not yet gone through the analyze/regression/impact/progression tasks).

### Attention points

Testcase.timestamp mutates in analyze task: https://github.com/google/clusterfuzz/blob/6ed80851ad0f6f624c5b232b0460c405f0a018b5/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py#L589

This makes it unreliable as a source of truth for testcase creation time. To circumvent that, a new `created` field is added to the Testcase entity, from which we can derive the correct creation time. Since this new field will only apply to testcases created after this PR is merged, Testcase.timestamp will be used instead to calculate the testcase age when the new field is missing.

### Testing strategy

Ran the triage cron locally, and verified that the codepath for the metric is hit and produces sane output (reference testcase: 4505741036158976).

![image](https://github.com/user-attachments/assets/6281b44f-768a-417e-8ec1-763f132c8181)

Part of #4271
1 parent 082b008 commit ba9009a

File tree

4 files changed

+54
-2
lines changed

4 files changed

+54
-2
lines changed

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,22 @@ def _file_issue(testcase, issue_tracker, throttler):
298298
return filed
299299

300300

301+
def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
                                        testcase: data_types.Testcase):
  """Emits a metric tracking the age of an untriaged testcase.

  Args:
    critical_tasks_completed: Whether the testcase's critical tasks have
      already finished. When true nothing is emitted, since the metric only
      covers testcases that have not completed those tasks.
    testcase: The testcase whose age (in seconds) is recorded, labelled with
      its job type and platform.
  """
  if critical_tasks_completed:
    return
  # The age is computed from get_created_time() (`created`, falling back to
  # `timestamp`), so guard on that same value instead of `timestamp` alone.
  if not testcase.get_created_time():
    return

  monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
      testcase.get_age_in_seconds(),
      labels={
          'job': testcase.job_type,
          'platform': testcase.platform,
      })
315+
316+
301317
def main():
302318
"""Files bugs."""
303319
try:
@@ -328,6 +344,8 @@ def main():
328344
# Already deleted.
329345
continue
330346

347+
critical_tasks_completed = data_handler.critical_tasks_completed(testcase)
348+
331349
# Skip if testcase's job is removed.
332350
if testcase.job_type not in all_jobs:
333351
continue
@@ -336,6 +354,9 @@ def main():
336354
if testcase.job_type in excluded_jobs:
337355
continue
338356

357+
# Emit the metric for testcases that should be triaged.
358+
_emit_untriaged_testcase_age_metric(critical_tasks_completed, testcase)
359+
339360
# Skip if we are running progression task at this time.
340361
if testcase.get_metadata('progression_pending'):
341362
continue
@@ -351,7 +372,7 @@ def main():
351372

352373
# Require that all tasks like minimization, regression testing, etc have
353374
# finished.
354-
if not data_handler.critical_tasks_completed(testcase):
375+
if not critical_tasks_completed:
355376
continue
356377

357378
# For testcases that are not part of a group, wait an additional time to

src/clusterfuzz/_internal/datastore/data_handler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,7 @@ def store_testcase(crash, fuzzed_keys, minimized_keys, regression, fixed,
788788
testcase.archive_filename = archive_filename
789789
testcase.http_flag = http_flag
790790
testcase.timestamp = datetime.datetime.utcnow()
791+
testcase.created = testcase.timestamp
791792
testcase.gestures = gestures
792793
testcase.redzone = redzone
793794
testcase.disable_ubsan = disable_ubsan
@@ -1377,6 +1378,7 @@ def create_user_uploaded_testcase(key,
13771378
testcase.set_metadata(metadata_key, metadata_value, update_testcase=False)
13781379

13791380
testcase.timestamp = utils.utcnow()
1381+
testcase.created = testcase.timestamp
13801382
testcase.uploader_email = uploader_email
13811383
testcase.put()
13821384

src/clusterfuzz/_internal/datastore/data_types.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,9 +421,16 @@ class Testcase(Model):
421421
# File name of the original uploaded archive.
422422
archive_filename = ndb.TextProperty()
423423

424-
# Timestamp.
424+
# The time when a testcase is considered valid. This is the same as the
425+
# creation time, except for analyze task, in which this field is a
426+
# placeholder and will be refreshed.
425427
timestamp = ndb.DateTimeProperty()
426428

429+
# Source of truth for creation time. This is missing for testcases
430+
# created before it was introduced, in which case the timestamp
431+
# field will be a proxy for creation time.
432+
created = ndb.DateTimeProperty(indexed=False)
433+
427434
# Does the testcase crash stack vary b/w crashes ?
428435
flaky_stack = ndb.BooleanProperty(default=False, indexed=False)
429436

@@ -671,6 +678,17 @@ def _ensure_metadata_is_cached(self):
671678

672679
setattr(self, 'metadata_cache', cache)
673680

681+
def get_created_time(self) -> 'datetime.datetime | None':
  """Returns the best available creation time for this testcase.

  `created` is the source of truth for creation time and is returned
  whenever it is set. When it is missing, `timestamp` is returned as a
  proxy for the creation time. The result is falsy when neither field
  is set.
  """
  # The annotation is quoted so it is never evaluated at runtime; the value
  # returned is the stored datetime, not the ndb property descriptor.
  return self.created or self.timestamp
686+
687+
def get_age_in_seconds(self):
  """Returns the time elapsed since this testcase was created, in seconds.

  The creation time comes from get_created_time() and is subtracted from
  the current time as reported by datetime.datetime.utcnow().
  """
  elapsed = datetime.datetime.utcnow() - self.get_created_time()
  return elapsed.total_seconds()
691+
674692
def get_metadata(self, key=None, default=None):
675693
"""Get metadata for a test case. Slow on first access."""
676694
self._ensure_metadata_is_cached()

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,17 @@
273273
monitor.StringField('status'),
274274
])
275275

276+
# Distribution of the age, in seconds, of testcases that have not yet been
# triaged, broken down by job and platform.
UNTRIAGED_TESTCASE_AGE = monitor.CumulativeDistributionMetric(
    'issues/untriaged_testcase_age',
    description=('Age of testcases that were not yet triaged (have not yet '
                 'completed analyze, regression, minimization, impact task), '
                 'in seconds.'),
    bucketer=monitor.GeometricBucketer(),
    field_spec=[
        monitor.StringField('job'),
        monitor.StringField('platform'),
    ])
286+
276287
ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(
277288
'task/analyze/reproducibility',
278289
description='Outcome count for analyze task.',

0 commit comments

Comments
 (0)