Skip to content

Commit 94d42ca

Browse files
authored
[Monitoring] Move histogram metrics to GeometricBucketer (#4432)
### Motivation Cumulative distribution metrics from the monitoring initiative were incorrectly set to use the fixed width bucketer, and/or width=0.05 and max_buckets=20. This caused percentile metrics to cap at 1, which was wrong behavior. This PR attempts to fix that by moving them all to Geometric Bucketer, without the aforementioned limits. It also reverts #4429 , since it apparently broke triage.py in chrome. Part of #4271
1 parent dc41f5a commit 94d42ca

File tree

5 files changed

+26
-42
lines changed

5 files changed

+26
-42
lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/fuzz_task.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,15 +1908,12 @@ def run(self):
19081908
self.fuzz_task_output.crash_groups.extend(crash_groups)
19091909

19101910
fuzzing_session_duration = time.time() - start_time
1911-
labels = {
1912-
'fuzzer': self.fuzzer_name,
1913-
'job': self.job_type,
1914-
'platform': environment.platform()
1915-
}
1916-
logs.info(f'FUZZING_SESSION_DURATION: add {fuzzing_session_duration} '
1917-
'for {labels}.')
1918-
monitoring_metrics.FUZZING_SESSION_DURATION.add(fuzzing_session_duration,
1919-
labels)
1911+
monitoring_metrics.FUZZING_SESSION_DURATION.add(
1912+
fuzzing_session_duration, {
1913+
'fuzzer': self.fuzzer_name,
1914+
'job': self.job_type,
1915+
'platform': environment.platform()
1916+
})
19201917

19211918
return uworker_msg_pb2.Output(fuzz_task_output=self.fuzz_task_output) # pylint: disable=no-member
19221919

src/clusterfuzz/_internal/build_management/build_manager.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -300,17 +300,14 @@ def set_env_var(name, value):
300300

301301

302302
def _emit_job_build_retrieval_metric(start_time, step, build_type):
303-
"""Emits a metrick to track the distribution of build retrieval times."""
304303
elapsed_minutes = (time.time() - start_time) / 60
305-
labels = {
306-
'job': os.getenv('JOB_NAME'),
307-
'platform': environment.platform(),
308-
'step': step,
309-
'build_type': build_type,
310-
}
311-
logs.info(f'JOB_BUILD_RETRIEVAL_TIME: adding {elapsed_minutes} '
312-
f'for labels {labels}.')
313-
monitoring_metrics.JOB_BUILD_RETRIEVAL_TIME.add(elapsed_minutes, labels)
304+
monitoring_metrics.JOB_BUILD_RETRIEVAL_TIME.add(
305+
elapsed_minutes, {
306+
'job': os.getenv('JOB_NAME'),
307+
'platform': environment.platform(),
308+
'step': step,
309+
'build_type': build_type,
310+
})
314311

315312

316313
class BaseBuild:
@@ -1223,7 +1220,6 @@ def _emit_build_age_metric(gcs_path):
12231220
'platform': environment.platform(),
12241221
'task': os.getenv('TASK_NAME'),
12251222
}
1226-
logs.info(f'JOB_BUILD_AGE: adding {elapsed_time_in_hours} for {labels}')
12271223
monitoring_metrics.JOB_BUILD_AGE.add(elapsed_time_in_hours, labels)
12281224
# This field is expected as a datetime object
12291225
# https://cloud.google.com/storage/docs/json_api/v1/objects#resource

src/clusterfuzz/_internal/common/testcase_utils.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,12 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
6161
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
6262
return
6363

64-
labels = {
65-
'job': testcase.job_type,
66-
'step': step,
67-
}
68-
69-
logs.info(
70-
f'TESTCASE_UPLOAD_TRIAGE_DURATION: adding {elapsed_time_since_upload} for {labels}.'
71-
)
72-
7364
monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
74-
elapsed_time_since_upload, labels=labels)
65+
elapsed_time_since_upload,
66+
labels={
67+
'job': testcase.job_type,
68+
'step': step,
69+
})
7570

7671

7772
def get_testcase_upload_metadata(

src/clusterfuzz/_internal/cron/triage.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -317,16 +317,12 @@ def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
317317
if not testcase.timestamp:
318318
return
319319

320-
labels = {
321-
'job': testcase.job_type,
322-
'platform': testcase.platform,
323-
}
324-
325-
logs.info(f'UNTRIAGED_TESTCASE_AGE: adding {testcase.get_age_in_seconds()}'
326-
' for {labels}')
327-
328320
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
329-
testcase.get_age_in_seconds(), labels=labels)
321+
testcase.get_age_in_seconds(),
322+
labels={
323+
'job': testcase.job_type,
324+
'platform': testcase.platform,
325+
})
330326

331327

332328
def main():

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
JOB_BUILD_AGE = monitor.CumulativeDistributionMetric(
3636
'job/build_age',
37-
bucketer=monitor.FixedWidthBucketer(width=0.05, num_finite_buckets=20),
37+
bucketer=monitor.GeometricBucketer(),
3838
description=('Distribution of latest build\'s age in hours. '
3939
'(grouped by fuzzer/job)'),
4040
field_spec=[
@@ -46,7 +46,7 @@
4646

4747
JOB_BUILD_RETRIEVAL_TIME = monitor.CumulativeDistributionMetric(
4848
'task/build_retrieval_time',
49-
bucketer=monitor.FixedWidthBucketer(width=0.05, num_finite_buckets=20),
49+
bucketer=monitor.GeometricBucketer(),
5050
description=('Distribution of fuzz task\'s build retrieval times. '
5151
'(grouped by fuzzer/job, in minutes).'),
5252
field_spec=[
@@ -120,7 +120,7 @@
120120
# fuzzing time and the time to upload results to datastore
121121
FUZZING_SESSION_DURATION = monitor.CumulativeDistributionMetric(
122122
'task/fuzz/session/duration',
123-
bucketer=monitor.FixedWidthBucketer(width=0.05, num_finite_buckets=20),
123+
bucketer=monitor.GeometricBucketer(),
124124
description=('Total duration of fuzzing session.'),
125125
field_spec=[
126126
monitor.StringField('fuzzer'),

0 commit comments

Comments
 (0)