Skip to content

Commit 006c443

Browse files
committed
kvserver: add storage.disk.{read,write}-max.iops
Fixes #150002

Epic: none

Release note: None
1 parent f6e9d3f commit 006c443

File tree

2 files changed

+50
-4
lines changed

2 files changed

+50
-4
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16185,6 +16185,14 @@ layers:
1618516185
unit: BYTES
1618616186
aggregation: AVG
1618716187
derivative: NONE
16188+
- name: storage.disk.read-max.iops
16189+
exported_name: storage_disk_read_max_iops
16190+
description: Maximum rate of read operations performed on the disk (as reported by the OS)
16191+
y_axis_label: Operations
16192+
type: GAUGE
16193+
unit: COUNT
16194+
aggregation: AVG
16195+
derivative: NONE
1618816196
- name: storage.disk.read.bytes
1618916197
exported_name: storage_disk_read_bytes
1619016198
description: Bytes read from the store's disk since this process started (as reported by the OS)
@@ -16225,6 +16233,14 @@ layers:
1622516233
unit: BYTES
1622616234
aggregation: AVG
1622716235
derivative: NONE
16236+
- name: storage.disk.write-max.iops
16237+
exported_name: storage_disk_write_max_iops
16238+
description: Maximum rate of write operations performed on the disk (as reported by the OS)
16239+
y_axis_label: Operations
16240+
type: GAUGE
16241+
unit: COUNT
16242+
aggregation: AVG
16243+
derivative: NONE
1622816244
- name: storage.disk.write.bytes
1622916245
exported_name: storage_disk_write_bytes
1623016246
description: Bytes written to the store's disk since this process started (as reported by the OS)

pkg/kv/kvserver/metrics.go

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2761,8 +2761,19 @@ Note that the measurement does not include the duration for replicating the eval
27612761
Measurement: "Operations",
27622762
Help: "IO operations currently in progress on the store's disk (as reported by the OS)",
27632763
}
2764-
// The disk rate metrics are computed using data sampled on the interval,
2765-
// COCKROACH_DISK_STATS_POLLING_INTERVAL.
2764+
// The max disk rate metrics are computed using data sampled at
2765+
// DefaultDiskStatsPollingInterval, which defaults to 100ms, and scaled up
2766+
// to be a per-second rate. This is useful to observe short-duration spikes
2767+
// that could result in throttling (and higher observed operation latency)
2768+
// but are not visible when computing the rate over the counter metrics that
2769+
// are sampled at the longer DefaultMetricsSampleInterval (10s).
2770+
//
2771+
// The expected usage is when a latency histogram, such as the fsync latency
2772+
// or disk read latency, shows high tail latency while the normal rate
2773+
// metrics show disk bandwidth and IOPS lower than the provisioned values.
2774+
// If these max rate metrics show usage close to the provisioned value, one
2775+
// can blame the high usage for the higher latency, and not blame it on
2776+
// unrelated slowness in the disk infrastructure.
27662777
metaDiskReadMaxBytesPerSecond = metric.Metadata{
27672778
Name: "storage.disk.read-max.bytespersecond",
27682779
Unit: metric.Unit_BYTES,
@@ -2775,6 +2786,18 @@ Note that the measurement does not include the duration for replicating the eval
27752786
Measurement: "Bytes",
27762787
Help: "Maximum rate at which bytes were written to disk (as reported by the OS)",
27772788
}
2789+
metaDiskReadMaxIOPS = metric.Metadata{
2790+
Name: "storage.disk.read-max.iops",
2791+
Unit: metric.Unit_COUNT,
2792+
Measurement: "Operations",
2793+
Help: "Maximum rate of read operations performed on the disk (as reported by the OS)",
2794+
}
2795+
metaDiskWriteMaxIOPS = metric.Metadata{
2796+
Name: "storage.disk.write-max.iops",
2797+
Unit: metric.Unit_COUNT,
2798+
Measurement: "Operations",
2799+
Help: "Maximum rate of write operations performed on the disk (as reported by the OS)",
2800+
}
27782801
)
27792802

27802803
// StoreMetrics is the set of metrics for a given store.
@@ -3212,6 +3235,8 @@ type StoreMetrics struct {
32123235
DiskIopsInProgress *metric.Gauge
32133236
DiskReadMaxBytesPerSecond *metric.Gauge
32143237
DiskWriteMaxBytesPerSecond *metric.Gauge
3238+
DiskReadMaxIOPS *metric.Gauge
3239+
DiskWriteMaxIOPS *metric.Gauge
32153240
}
32163241

32173242
// TenantsStorageMetrics are metrics which are aggregated over all tenants
@@ -3990,6 +4015,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
39904015
DiskIopsInProgress: metric.NewGauge(metaDiskIopsInProgress),
39914016
DiskReadMaxBytesPerSecond: metric.NewGauge(metaDiskReadMaxBytesPerSecond),
39924017
DiskWriteMaxBytesPerSecond: metric.NewGauge(metaDiskWriteMaxBytesPerSecond),
4018+
DiskReadMaxIOPS: metric.NewGauge(metaDiskReadMaxIOPS),
4019+
DiskWriteMaxIOPS: metric.NewGauge(metaDiskWriteMaxIOPS),
39934020

39944021
// Estimated MVCC stats in split.
39954022
SplitsWithEstimatedStats: metric.NewCounter(metaSplitEstimatedStats),
@@ -4262,11 +4289,14 @@ func (sm *StoreMetrics) updateDiskStats(
42624289
log.Errorf(ctx, "not updating cumulative stats due to %s", cumulativeStatsErr)
42634290
}
42644291
maxRollingStats := rollingStats.Max()
4265-
// maxRollingStats is computed as the change in stats every 100ms, so we
4266-
// scale them to represent the change in stats every 1s.
4292+
// maxRollingStats is computed as the change in stats every 100ms
4293+
// (DefaultDiskStatsPollingInterval), so we scale them to represent the
4294+
// change in stats every 1s.
42674295
perSecondMultiplier := int(time.Second / disk.DefaultDiskStatsPollingInterval)
42684296
sm.DiskReadMaxBytesPerSecond.Update(int64(maxRollingStats.BytesRead() * perSecondMultiplier))
42694297
sm.DiskWriteMaxBytesPerSecond.Update(int64(maxRollingStats.BytesWritten() * perSecondMultiplier))
4298+
sm.DiskReadMaxIOPS.Update(int64(maxRollingStats.ReadsCount * perSecondMultiplier))
4299+
sm.DiskWriteMaxIOPS.Update(int64(maxRollingStats.WritesCount * perSecondMultiplier))
42704300
}
42714301

42724302
func (sm *StoreMetrics) handleMetricsResult(ctx context.Context, metric result.Metrics) {

0 commit comments

Comments
 (0)