Skip to content

Commit 58193a9

Browse files
authored
Add logging for misbehaving distribution metrics (#4429)
### Motivation

Some cumulative distribution metrics (build age, build retrieval time, untriaged testcase age, testcase triage duration) are misbehaving and capping at 1. This PR adds logging of the emitted values and labels to aid in debugging that.
1 parent adc50ff commit 58193a9

File tree

4 files changed

+39
-23
lines changed

4 files changed

+39
-23
lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,12 +1887,15 @@ def run(self):
18871887
self.fuzz_task_output.crash_groups.extend(crash_groups)
18881888

18891889
fuzzing_session_duration = time.time() - start_time
1890-
monitoring_metrics.FUZZING_SESSION_DURATION.add(
1891-
fuzzing_session_duration, {
1892-
'fuzzer': self.fuzzer_name,
1893-
'job': self.job_type,
1894-
'platform': environment.platform()
1895-
})
1890+
labels = {
1891+
'fuzzer': self.fuzzer_name,
1892+
'job': self.job_type,
1893+
'platform': environment.platform()
1894+
}
1895+
logs.info(f'FUZZING_SESSION_DURATION: add {fuzzing_session_duration} '
1896+
f'for {labels}.')
1897+
monitoring_metrics.FUZZING_SESSION_DURATION.add(fuzzing_session_duration,
1898+
labels)
18961899

18971900
return uworker_msg_pb2.Output(fuzz_task_output=self.fuzz_task_output) # pylint: disable=no-member
18981901

src/clusterfuzz/_internal/build_management/build_manager.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -300,14 +300,17 @@ def set_env_var(name, value):
300300

301301

302302
def _emit_job_build_retrieval_metric(start_time, step, build_type):
303+
"""Emits a metric to track the distribution of build retrieval times."""
303304
elapsed_minutes = (time.time() - start_time) / 60
304-
monitoring_metrics.JOB_BUILD_RETRIEVAL_TIME.add(
305-
elapsed_minutes, {
306-
'job': os.getenv('JOB_NAME'),
307-
'platform': environment.platform(),
308-
'step': step,
309-
'build_type': build_type,
310-
})
305+
labels = {
306+
'job': os.getenv('JOB_NAME'),
307+
'platform': environment.platform(),
308+
'step': step,
309+
'build_type': build_type,
310+
}
311+
logs.info(f'JOB_BUILD_RETRIEVAL_TIME: adding {elapsed_minutes} '
312+
f'for labels {labels}.')
313+
monitoring_metrics.JOB_BUILD_RETRIEVAL_TIME.add(elapsed_minutes, labels)
311314

312315

313316
class BaseBuild:
@@ -1220,6 +1223,7 @@ def _emit_build_age_metric(gcs_path):
12201223
'platform': environment.platform(),
12211224
'task': os.getenv('TASK_NAME'),
12221225
}
1226+
logs.info(f'JOB_BUILD_AGE: adding {elapsed_time_in_hours} for {labels}')
12231227
monitoring_metrics.JOB_BUILD_AGE.add(elapsed_time_in_hours, labels)
12241228
# This field is expected as a datetime object
12251229
# https://cloud.google.com/storage/docs/json_api/v1/objects#resource

src/clusterfuzz/_internal/common/testcase_utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,17 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
6161
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
6262
return
6363

64+
labels = {
65+
'job': testcase.job_type,
66+
'step': step,
67+
}
68+
69+
logs.info(
70+
f'TESTCASE_UPLOAD_TRIAGE_DURATION: adding {elapsed_time_since_upload} for {labels}.'
71+
)
72+
6473
monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
65-
elapsed_time_since_upload,
66-
labels={
67-
'job': testcase.job_type,
68-
'step': step,
69-
})
74+
elapsed_time_since_upload, labels=labels)
7075

7176

7277
def get_testcase_upload_metadata(

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -306,12 +306,16 @@ def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
306306
if not testcase.timestamp:
307307
return
308308

309+
labels = {
310+
'job': testcase.job_type,
311+
'platform': testcase.platform,
312+
}
313+
314+
logs.info(f'UNTRIAGED_TESTCASE_AGE: adding {testcase.get_age_in_seconds()}'
315+
f' for {labels}')
316+
309317
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
310-
testcase.get_age_in_seconds(),
311-
labels={
312-
'job': testcase.job_type,
313-
'platform': testcase.platform,
314-
})
318+
testcase.get_age_in_seconds(), labels=labels)
315319

316320

317321
def main():

0 commit comments

Comments
 (0)