
Commit c748c25

Eric Harmeling committed
metrics: refactor histogram bucket generation and testing
This commit refactors histogram bucketing for legibility and composability. It also introduces a data-driven test for histogram bucket generation. This refactor should make it easier to add additional metric categories, distributions, and bucket types.

Part of cockroachdb#97144.

Release note: None
1 parent 1f8fa96 commit c748c25

Note: large commits have some content hidden by default, so only a subset of the changed files appears below.
41 files changed: +736 -675 lines
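
Every file in this commit makes the same mechanical change: the Buckets field of metric.HistogramOptions is replaced by a BucketConfig field, and call sites keep referencing the same named bucket sets (metric.IOLatencyBuckets, metric.BatchProcessLatencyBuckets, metric.DataSize16MBBuckets, and so on), now as bucket configurations. As a minimal before/after illustration, here is the MessageSize histogram from the pkg/ccl/changefeedccl/metrics.go diff below:

	// Before: buckets are passed via the Buckets field.
	MessageSize: b.Histogram(metric.HistogramOptions{
		Metadata: metaMessageSize,
		Duration: histogramWindow,
		MaxVal:   10 << 20, /* 10MB max message size */
		SigFigs:  1,
		Buckets:  metric.DataSize16MBBuckets,
	}),

	// After: the same call site uses BucketConfig; only the field changes.
	MessageSize: b.Histogram(metric.HistogramOptions{
		Metadata:     metaMessageSize,
		Duration:     histogramWindow,
		MaxVal:       10 << 20, /* 10MB max message size */
		SigFigs:      1,
		BucketConfig: metric.DataSize16MBBuckets,
	}),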

pkg/ccl/changefeedccl/metrics.go

Lines changed: 43 additions & 43 deletions
@@ -552,52 +552,52 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
 		EmittedMessages: b.Counter(metaChangefeedEmittedMessages),
 		FilteredMessages: b.Counter(metaChangefeedFilteredMessages),
 		MessageSize: b.Histogram(metric.HistogramOptions{
-			Metadata: metaMessageSize,
-			Duration: histogramWindow,
-			MaxVal:   10 << 20, /* 10MB max message size */
-			SigFigs:  1,
-			Buckets:  metric.DataSize16MBBuckets,
+			Metadata:     metaMessageSize,
+			Duration:     histogramWindow,
+			MaxVal:       10 << 20, /* 10MB max message size */
+			SigFigs:      1,
+			BucketConfig: metric.DataSize16MBBuckets,
 		}),
 		EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
 		FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
 		Flushes: b.Counter(metaChangefeedFlushes),
 		SizeBasedFlushes: b.Counter(metaSizeBasedFlushes),
 		ParallelIOQueueNanos: b.Histogram(metric.HistogramOptions{
-			Metadata: metaChangefeedParallelIOQueueNanos,
-			Duration: histogramWindow,
-			MaxVal:   changefeedIOQueueMaxLatency.Nanoseconds(),
-			SigFigs:  2,
-			Buckets:  metric.BatchProcessLatencyBuckets,
+			Metadata:     metaChangefeedParallelIOQueueNanos,
+			Duration:     histogramWindow,
+			MaxVal:       changefeedIOQueueMaxLatency.Nanoseconds(),
+			SigFigs:      2,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
 		}),
 		SinkIOInflight: b.Gauge(metaChangefeedSinkIOInflight),
 
 		BatchHistNanos: b.Histogram(metric.HistogramOptions{
-			Metadata: metaChangefeedBatchHistNanos,
-			Duration: histogramWindow,
-			MaxVal:   changefeedBatchHistMaxLatency.Nanoseconds(),
-			SigFigs:  1,
-			Buckets:  metric.BatchProcessLatencyBuckets,
+			Metadata:     metaChangefeedBatchHistNanos,
+			Duration:     histogramWindow,
+			MaxVal:       changefeedBatchHistMaxLatency.Nanoseconds(),
+			SigFigs:      1,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
 		}),
 		FlushHistNanos: b.Histogram(metric.HistogramOptions{
-			Metadata: metaChangefeedFlushHistNanos,
-			Duration: histogramWindow,
-			MaxVal:   changefeedFlushHistMaxLatency.Nanoseconds(),
-			SigFigs:  2,
-			Buckets:  metric.BatchProcessLatencyBuckets,
+			Metadata:     metaChangefeedFlushHistNanos,
+			Duration:     histogramWindow,
+			MaxVal:       changefeedFlushHistMaxLatency.Nanoseconds(),
+			SigFigs:      2,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
 		}),
 		CommitLatency: b.Histogram(metric.HistogramOptions{
-			Metadata: metaCommitLatency,
-			Duration: histogramWindow,
-			MaxVal:   commitLatencyMaxValue.Nanoseconds(),
-			SigFigs:  1,
-			Buckets:  metric.BatchProcessLatencyBuckets,
+			Metadata:     metaCommitLatency,
+			Duration:     histogramWindow,
+			MaxVal:       commitLatencyMaxValue.Nanoseconds(),
+			SigFigs:      1,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
 		}),
 		AdmitLatency: b.Histogram(metric.HistogramOptions{
-			Metadata: metaAdmitLatency,
-			Duration: histogramWindow,
-			MaxVal:   admitLatencyMaxValue.Nanoseconds(),
-			SigFigs:  1,
-			Buckets:  metric.BatchProcessLatencyBuckets,
+			Metadata:     metaAdmitLatency,
+			Duration:     histogramWindow,
+			MaxVal:       admitLatencyMaxValue.Nanoseconds(),
+			SigFigs:      1,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
 		}),
 		BackfillCount: b.Gauge(metaChangefeedBackfillCount),
 		BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges),
@@ -712,27 +712,27 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
 		Failures: metric.NewCounter(metaChangefeedFailures),
 		QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
 		CheckpointHistNanos: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaChangefeedCheckpointHistNanos,
-			Duration: histogramWindow,
-			MaxVal:   changefeedCheckpointHistMaxLatency.Nanoseconds(),
-			SigFigs:  2,
-			Buckets:  metric.IOLatencyBuckets,
+			Metadata:     metaChangefeedCheckpointHistNanos,
+			Duration:     histogramWindow,
+			MaxVal:       changefeedCheckpointHistMaxLatency.Nanoseconds(),
+			SigFigs:      2,
+			BucketConfig: metric.IOLatencyBuckets,
 		}),
 		FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
 		ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
 		// Below two metrics were never implemented using the hdr histogram. Set ForceUsePrometheus
 		// to true.
 		ParallelConsumerFlushNanos: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaChangefeedEventConsumerFlushNanos,
-			Duration: histogramWindow,
-			Buckets:  metric.IOLatencyBuckets,
-			Mode:     metric.HistogramModePrometheus,
+			Metadata:     metaChangefeedEventConsumerFlushNanos,
+			Duration:     histogramWindow,
+			BucketConfig: metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePrometheus,
 		}),
 		ParallelConsumerConsumeNanos: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaChangefeedEventConsumerConsumeNanos,
-			Duration: histogramWindow,
-			Buckets:  metric.IOLatencyBuckets,
-			Mode:     metric.HistogramModePrometheus,
+			Metadata:     metaChangefeedEventConsumerConsumeNanos,
+			Duration:     histogramWindow,
+			BucketConfig: metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePrometheus,
 		}),
 		ParallelConsumerInFlightEvents: metric.NewGauge(metaChangefeedEventConsumerInFlightEvents),
 	}

pkg/ccl/sqlproxyccl/connector_test.go

Lines changed: 12 additions & 12 deletions
@@ -381,10 +381,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
 		c := &connector{
 			TenantID: roachpb.MustMakeTenantID(42),
 			DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
-				Mode:     metric.HistogramModePrometheus,
-				Metadata: metaDialTenantLatency,
-				Duration: time.Millisecond,
-				Buckets:  metric.IOLatencyBuckets,
+				Mode:         metric.HistogramModePrometheus,
+				Metadata:     metaDialTenantLatency,
+				Duration:     time.Millisecond,
+				BucketConfig: metric.IOLatencyBuckets,
 			}),
 			DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
 		}
@@ -466,10 +466,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
 
 		c := &connector{
 			DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
-				Mode:     metric.HistogramModePreferHdrLatency,
-				Metadata: metaDialTenantLatency,
-				Duration: time.Millisecond,
-				Buckets:  metric.IOLatencyBuckets,
+				Mode:         metric.HistogramModePreferHdrLatency,
+				Metadata:     metaDialTenantLatency,
+				Duration:     time.Millisecond,
+				BucketConfig: metric.IOLatencyBuckets,
 			}),
 			DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
 		}
@@ -500,10 +500,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
 		c := &connector{
 			TenantID: roachpb.MustMakeTenantID(42),
 			DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
-				Mode:     metric.HistogramModePreferHdrLatency,
-				Metadata: metaDialTenantLatency,
-				Duration: time.Millisecond,
-				Buckets:  metric.IOLatencyBuckets,
+				Mode:         metric.HistogramModePreferHdrLatency,
+				Metadata:     metaDialTenantLatency,
+				Duration:     time.Millisecond,
+				BucketConfig: metric.IOLatencyBuckets,
 			}),
 			DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
 		}

pkg/ccl/sqlproxyccl/metrics.go

Lines changed: 17 additions & 17 deletions
@@ -234,19 +234,19 @@ func makeProxyMetrics() metrics {
 		RefusedConnCount: metric.NewCounter(metaRefusedConnCount),
 		SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount),
 		ConnectionLatency: metric.NewHistogram(metric.HistogramOptions{
-			Mode:     metric.HistogramModePreferHdrLatency,
-			Metadata: metaConnMigrationAttemptedCount,
-			Duration: base.DefaultHistogramWindowInterval(),
-			Buckets:  metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePreferHdrLatency,
+			Metadata:     metaConnMigrationAttemptedCount,
+			Duration:     base.DefaultHistogramWindowInterval(),
+			BucketConfig: metric.IOLatencyBuckets,
 		}),
 		AuthFailedCount: metric.NewCounter(metaAuthFailedCount),
 		ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount),
 		// Connector metrics.
 		DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
-			Mode:     metric.HistogramModePreferHdrLatency,
-			Metadata: metaDialTenantLatency,
-			Duration: base.DefaultHistogramWindowInterval(),
-			Buckets:  metric.IOLatencyBuckets},
+			Mode:         metric.HistogramModePreferHdrLatency,
+			Metadata:     metaDialTenantLatency,
+			Duration:     base.DefaultHistogramWindowInterval(),
+			BucketConfig: metric.IOLatencyBuckets},
 		),
 		DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
 		// Connection migration metrics.
@@ -255,17 +255,17 @@ func makeProxyMetrics() metrics {
 		ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount),
 		ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount),
 		ConnMigrationAttemptedLatency: metric.NewHistogram(metric.HistogramOptions{
-			Mode:     metric.HistogramModePreferHdrLatency,
-			Metadata: metaConnMigrationAttemptedLatency,
-			Duration: base.DefaultHistogramWindowInterval(),
-			Buckets:  metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePreferHdrLatency,
+			Metadata:     metaConnMigrationAttemptedLatency,
+			Duration:     base.DefaultHistogramWindowInterval(),
+			BucketConfig: metric.IOLatencyBuckets,
 		}),
 		ConnMigrationTransferResponseMessageSize: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaConnMigrationTransferResponseMessageSize,
-			Duration: base.DefaultHistogramWindowInterval(),
-			Buckets:  metric.DataSize16MBBuckets,
-			MaxVal:   maxExpectedTransferResponseMessageSize,
-			SigFigs:  1,
+			Metadata:     metaConnMigrationTransferResponseMessageSize,
+			Duration:     base.DefaultHistogramWindowInterval(),
+			BucketConfig: metric.DataSize16MBBuckets,
+			MaxVal:       maxExpectedTransferResponseMessageSize,
+			SigFigs:      1,
 		}),
 		QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire),
 		QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP),

pkg/ccl/streamingccl/streamingest/metrics.go

Lines changed: 15 additions & 15 deletions
@@ -171,25 +171,25 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
 		JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
 		ReplanCount: metric.NewCounter(metaDistSQLReplanCount),
 		FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaReplicationFlushHistNanos,
-			Duration: histogramWindow,
-			Buckets:  metric.BatchProcessLatencyBuckets,
-			MaxVal:   streamingFlushHistMaxLatency.Nanoseconds(),
-			SigFigs:  1,
+			Metadata:     metaReplicationFlushHistNanos,
+			Duration:     histogramWindow,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
+			MaxVal:       streamingFlushHistMaxLatency.Nanoseconds(),
+			SigFigs:      1,
 		}),
 		CommitLatency: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaReplicationCommitLatency,
-			Duration: histogramWindow,
-			Buckets:  metric.BatchProcessLatencyBuckets,
-			MaxVal:   streamingCommitLatencyMaxValue.Nanoseconds(),
-			SigFigs:  1,
+			Metadata:     metaReplicationCommitLatency,
+			Duration:     histogramWindow,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
+			MaxVal:       streamingCommitLatencyMaxValue.Nanoseconds(),
+			SigFigs:      1,
 		}),
 		AdmitLatency: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaReplicationAdmitLatency,
-			Duration: histogramWindow,
-			Buckets:  metric.BatchProcessLatencyBuckets,
-			MaxVal:   streamingAdmitLatencyMaxValue.Nanoseconds(),
-			SigFigs:  1,
+			Metadata:     metaReplicationAdmitLatency,
+			Duration:     histogramWindow,
+			BucketConfig: metric.BatchProcessLatencyBuckets,
+			MaxVal:       streamingAdmitLatencyMaxValue.Nanoseconds(),
+			SigFigs:      1,
 		}),
 		RunningCount: metric.NewGauge(metaStreamsRunning),
 		EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),

pkg/kv/bulk/bulk_metrics.go

Lines changed: 5 additions & 5 deletions
@@ -52,11 +52,11 @@ const log10int64times1000 = 19 * 1000
 func MakeBulkMetrics(histogramWindow time.Duration) Metrics {
 	return Metrics{
 		MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaMemMaxBytes,
-			Duration: histogramWindow,
-			MaxVal:   log10int64times1000,
-			SigFigs:  3,
-			Buckets:  metric.MemoryUsage64MBBuckets,
+			Metadata:     metaMemMaxBytes,
+			Duration:     histogramWindow,
+			MaxVal:       log10int64times1000,
+			SigFigs:      3,
+			BucketConfig: metric.MemoryUsage64MBBuckets,
 		}),
 		CurBytesCount: metric.NewGauge(metaMemCurBytes),
 	}

pkg/kv/kvclient/kvcoord/txn_metrics.go

Lines changed: 9 additions & 9 deletions
@@ -285,20 +285,20 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
 		ClientRefreshAutoRetries: metric.NewCounter(metaClientRefreshAutoRetries),
 		ServerRefreshSuccess: metric.NewCounter(metaServerRefreshSuccess),
 		Durations: metric.NewHistogram(metric.HistogramOptions{
-			Mode:     metric.HistogramModePreferHdrLatency,
-			Metadata: metaDurationsHistograms,
-			Duration: histogramWindow,
-			Buckets:  metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePreferHdrLatency,
+			Metadata:     metaDurationsHistograms,
+			Duration:     histogramWindow,
+			BucketConfig: metric.IOLatencyBuckets,
 		}),
 		TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans),
 		TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge),
 		TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget),
 		Restarts: metric.NewHistogram(metric.HistogramOptions{
-			Metadata: metaRestartsHistogram,
-			Duration: histogramWindow,
-			MaxVal:   100,
-			SigFigs:  3,
-			Buckets:  metric.Count1KBuckets,
+			Metadata:     metaRestartsHistogram,
+			Duration:     histogramWindow,
+			MaxVal:       100,
+			SigFigs:      3,
+			BucketConfig: metric.Count1KBuckets,
 		}),
 		RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld),
 		RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti),

pkg/kv/kvprober/kvprober.go

Lines changed: 8 additions & 8 deletions
@@ -276,18 +276,18 @@ func NewProber(opts Opts) *Prober {
 		ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
 		ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
 		ReadProbeLatency: metric.NewHistogram(metric.HistogramOptions{
-			Mode:     metric.HistogramModePreferHdrLatency,
-			Metadata: metaReadProbeLatency,
-			Duration: opts.HistogramWindowInterval,
-			Buckets:  metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePreferHdrLatency,
+			Metadata:     metaReadProbeLatency,
+			Duration:     opts.HistogramWindowInterval,
+			BucketConfig: metric.IOLatencyBuckets,
 		}),
 		WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts),
 		WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures),
 		WriteProbeLatency: metric.NewHistogram(metric.HistogramOptions{
-			Mode:     metric.HistogramModePreferHdrLatency,
-			Metadata: metaWriteProbeLatency,
-			Duration: opts.HistogramWindowInterval,
-			Buckets:  metric.IOLatencyBuckets,
+			Mode:         metric.HistogramModePreferHdrLatency,
+			Metadata:     metaWriteProbeLatency,
+			Duration:     opts.HistogramWindowInterval,
+			BucketConfig: metric.IOLatencyBuckets,
 		}),
 		WriteProbeQuarantineOldestDuration: metric.NewFunctionalGauge(
 			metaWriteProbeQuarantineOldestDuration,

pkg/kv/kvserver/client_manual_proposal_test.go

Lines changed: 4 additions & 4 deletions
@@ -232,10 +232,10 @@ LIMIT
 		Settings: st,
 		Metrics: logstore.Metrics{
 			RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{
-				Mode:     metric.HistogramModePrometheus,
-				Metadata: fakeMeta,
-				Duration: time.Millisecond,
-				Buckets:  metric.IOLatencyBuckets,
+				Mode:         metric.HistogramModePrometheus,
+				Metadata:     fakeMeta,
+				Duration:     time.Millisecond,
+				BucketConfig: metric.IOLatencyBuckets,
 			}),
 		},
 	}

pkg/kv/kvserver/kvflowcontrol/kvflowcontroller/kvflowcontroller_metrics.go

Lines changed: 4 additions & 4 deletions
@@ -186,10 +186,10 @@ func newMetrics(c *Controller) *metrics {
 		)
 		m.WaitDuration[wc] = metric.NewHistogram(
 			metric.HistogramOptions{
-				Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
-				Duration: base.DefaultHistogramWindowInterval(),
-				Buckets:  metric.IOLatencyBuckets,
-				Mode:     metric.HistogramModePrometheus,
+				Metadata:     annotateMetricTemplateWithWorkClass(wc, waitDuration),
+				Duration:     base.DefaultHistogramWindowInterval(),
+				BucketConfig: metric.IOLatencyBuckets,
+				Mode:         metric.HistogramModePrometheus,
 			},
 		)
 		m.TotalStreamCount[wc] = metric.NewFunctionalGauge(

pkg/kv/kvserver/kvflowcontrol/kvflowhandle/kvflowhandle_metrics.go

Lines changed: 4 additions & 4 deletions
@@ -109,10 +109,10 @@ func NewMetrics(registry *metric.Registry) *Metrics {
 		)
 		m.WaitDuration[wc] = metric.NewHistogram(
 			metric.HistogramOptions{
-				Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
-				Duration: base.DefaultHistogramWindowInterval(),
-				Buckets:  metric.IOLatencyBuckets,
-				Mode:     metric.HistogramModePrometheus,
+				Metadata:     annotateMetricTemplateWithWorkClass(wc, waitDuration),
+				Duration:     base.DefaultHistogramWindowInterval(),
+				BucketConfig: metric.IOLatencyBuckets,
+				Mode:         metric.HistogramModePrometheus,
 			},
 		)
 	}
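
The data-driven test for histogram bucket generation that the commit message mentions is not among the files shown above. Purely to illustrate the idea, here is a minimal, hypothetical sketch of such a test in Go: a table of bucket configurations paired with the boundaries they are expected to produce. The makeExponentialBuckets helper and the test cases are invented stand-ins, not code from this commit.

	package metric_test

	import (
		"math"
		"testing"

		"github.com/stretchr/testify/require"
	)

	// makeExponentialBuckets is an illustrative stand-in for a bucket-generation
	// helper: it returns count boundaries spaced exponentially between min and
	// max, inclusive of both endpoints.
	func makeExponentialBuckets(min, max float64, count int) []float64 {
		buckets := make([]float64, count)
		ratio := math.Pow(max/min, 1/float64(count-1))
		for i := range buckets {
			buckets[i] = min * math.Pow(ratio, float64(i))
		}
		return buckets
	}

	// TestHistogramBuckets drives the generator with a table of configurations
	// and compares the boundaries it produces against the expected values.
	func TestHistogramBuckets(t *testing.T) {
		testCases := []struct {
			name     string
			min, max float64
			count    int
			want     []float64
		}{
			{name: "powers of ten", min: 1, max: 1000, count: 4, want: []float64{1, 10, 100, 1000}},
			{name: "powers of two", min: 1, max: 8, count: 4, want: []float64{1, 2, 4, 8}},
		}
		for _, tc := range testCases {
			t.Run(tc.name, func(t *testing.T) {
				got := makeExponentialBuckets(tc.min, tc.max, tc.count)
				require.InDeltaSlice(t, tc.want, got, 1e-9)
			})
		}
	}

The actual test added by this commit may be shaped differently; the point is only that once bucket generation is a pure function of a configuration, a table-driven test can exercise each category and distribution directly.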
