
Commit 3834ff7

admission,storage: increase overload score of store with disk slowness

So that the allocator accounts for it in shedding leases etc. The unhealthy
duration is also exported as a metric.

Informs #153280

Epic: none

Release note (ops change): The cluster setting storage.unhealthy_write_duration
(defaults to 20s) is used to indicate to the allocator that a store's disk is
unhealthy. The cluster setting kv.allocator.disk_unhealthy_io_overload_score
controls the overload score assigned to a store with an unhealthy disk; a
higher score prevents lease or replica transfers to the store and causes the
store to shed its leases. The default value of that setting is 0, so allocator
behavior is unaffected by default.
1 parent 1b92a23 commit 3834ff7

File tree

17 files changed: +454 -32 lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 8 additions & 0 deletions
@@ -16351,6 +16351,14 @@ layers:
       unit: COUNT
       aggregation: AVG
       derivative: NON_NEGATIVE_DERIVATIVE
+    - name: storage.disk-unhealthy.duration
+      exported_name: storage_disk_unhealthy_duration
+      description: Total disk unhealthy duration in nanos
+      y_axis_label: Nanoseconds
+      type: COUNTER
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NON_NEGATIVE_DERIVATIVE
     - name: storage.disk.io.time
       exported_name: storage_disk_io_time
       description: Time spent reading from or writing to the store's disk since this process started (as reported by the OS)

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
@@ -367,6 +367,7 @@
 <tr><td><div id="setting-storage-sstable-compression-algorithm" class="anchored"><code>storage.sstable.compression_algorithm</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for use in a Pebble store (balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, balanced = 6, good = 7]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
 <tr><td><div id="setting-storage-sstable-compression-algorithm-backup-storage" class="anchored"><code>storage.sstable.compression_algorithm_backup_storage</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for backup row data storage (fast,balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, fast = 6, balanced = 7, good = 8]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
 <tr><td><div id="setting-storage-sstable-compression-algorithm-backup-transport" class="anchored"><code>storage.sstable.compression_algorithm_backup_transport</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for backup transport (fast,balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, fast = 6, balanced = 7, good = 8]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
+<tr><td><div id="setting-storage-unhealthy-write-duration" class="anchored"><code>storage.unhealthy_write_duration</code></div></td><td>duration</td><td><code>20s</code></td><td>duration for disk write operations, beyond which the disk will be reported as unhealthy for higher layer actions</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-storage-wal-failover-unhealthy-op-threshold" class="anchored"><code>storage.wal_failover.unhealthy_op_threshold</code></div></td><td>duration</td><td><code>100ms</code></td><td>the latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL location</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-enabled" class="anchored"><code>timeseries.storage.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhere</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-resolution-10s-ttl" class="anchored"><code>timeseries.storage.resolution_10s.ttl</code></div></td><td>duration</td><td><code>240h0m0s</code></td><td>the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>

pkg/kv/kvserver/allocator/allocatorimpl/allocator.go

Lines changed: 1 addition & 0 deletions
@@ -2358,6 +2358,7 @@ func (a *Allocator) IOOverloadOptions() IOOverloadOptions {
 		ReplicaIOOverloadThreshold:   ReplicaIOOverloadThreshold.Get(&a.st.SV),
 		LeaseIOOverloadThreshold:     LeaseIOOverloadThreshold.Get(&a.st.SV),
 		LeaseIOOverloadShedThreshold: LeaseIOOverloadShedThreshold.Get(&a.st.SV),
+		DiskUnhealthyScore:           DiskUnhealthyIOOverloadScore.Get(&a.st.SV),
 	}
 }
 

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go

Lines changed: 47 additions & 20 deletions
@@ -229,6 +229,20 @@ var LeaseIOOverloadThresholdEnforcement = settings.RegisterEnumSetting(
 	},
 )
 
+// DiskUnhealthyIOOverloadScore is the IO overload score assigned to a store
+// when its disk is considered unhealthy. The value set here will be used in
+// the context of {LeaseIOOverloadThreshold, LeaseIOOverloadShedThreshold,
+// LeaseIOOverloadThresholdEnforcement} to shed leases, and in the context of
+// {ReplicaIOOverloadThreshold, ReplicaIOOverloadThresholdEnforcement}, when
+// transferring replicas.
+//
+// TODO(sumeer): change to DefaultLeaseIOOverloadShedThreshold after discussion.
+var DiskUnhealthyIOOverloadScore = settings.RegisterFloatSetting(
+	settings.SystemOnly,
+	"kv.allocator.disk_unhealthy_io_overload_score",
+	"the IO overload score to assign to a store when its disk is unhealthy",
+	0)
+
 // maxDiskUtilizationThreshold controls the point at which the store cedes
 // having room for new replicas. If the fraction used of a store descriptor
 // capacity is greater than this value, it will never be used as a rebalance or
@@ -2466,42 +2480,55 @@ type IOOverloadOptions struct {
 	ReplicaIOOverloadThreshold   float64
 	LeaseIOOverloadThreshold     float64
 	LeaseIOOverloadShedThreshold float64
+
+	DiskUnhealthyScore float64
 }
 
 func ioOverloadCheck(
-	score, avg, absThreshold, meanThreshold float64,
+	overloadScore, overloadAvg, diskUnhealthyScore, absThreshold, meanThreshold float64,
 	enforcement IOOverloadEnforcementLevel,
 	disallowed ...IOOverloadEnforcementLevel,
 ) (ok bool, reason string) {
-	absCheck := score < absThreshold
-	meanCheck := score < avg*meanThreshold
+	absCheck := overloadScore < absThreshold
+	meanCheck := overloadScore < overloadAvg*meanThreshold
+	// We do not bother with the mean for the disk unhealthy score, because disk
+	// unhealthiness is rare, and also because this code was bolted on later
+	// (and was simpler to ignore the mean).
+	//
+	// TODO(sumeer): revisit this if this turns out to be useful, and do a
+	// cleaner version for MMA.
+	diskCheck := diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
 
 	// The score needs to be no less than both the average threshold and the
 	// absolute threshold in order to be considered IO overloaded.
-	if absCheck || meanCheck {
+	if (absCheck || meanCheck) && diskCheck {
 		return true, ""
 	}
 
 	for _, disallowedEnforcement := range disallowed {
 		if enforcement == disallowedEnforcement {
 			return false, fmt.Sprintf(
-				"io overload %.2f exceeds threshold %.2f, above average: %.2f, enforcement %d",
-				score, absThreshold, avg, enforcement)
+				"io overload %.2f (disk %.2f) exceeds threshold %.2f, above average: %.2f, enforcement %d",
+				overloadScore, diskUnhealthyScore, absThreshold, overloadAvg, enforcement)
 		}
 	}
 
 	return true, ""
 }
 
-func (o IOOverloadOptions) storeScore(store roachpb.StoreDescriptor) float64 {
-	var score float64
+func (o IOOverloadOptions) storeScore(
+	store roachpb.StoreDescriptor,
+) (overloadScore float64, diskUnhealthyScore float64) {
 	if o.UseIOThresholdMax {
-		score, _ = store.Capacity.IOThresholdMax.Score()
+		overloadScore, _ = store.Capacity.IOThresholdMax.Score()
 	} else {
-		score, _ = store.Capacity.IOThreshold.Score()
+		overloadScore, _ = store.Capacity.IOThreshold.Score()
 	}
-
-	return score
+	diskUnhealthyScore = 0
+	if store.Capacity.IOThreshold.DiskUnhealthy {
+		diskUnhealthyScore = o.DiskUnhealthyScore
+	}
+	return overloadScore, diskUnhealthyScore
 }
 
 func (o IOOverloadOptions) storeListAvgScore(storeList storepool.StoreList) float64 {
@@ -2517,10 +2544,10 @@ func (o IOOverloadOptions) storeListAvgScore(storeList storepool.StoreList) floa
 func (o IOOverloadOptions) allocateReplicaToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.ReplicaIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.ReplicaEnforcementLevel,
 		IOOverloadThresholdBlockAll,
@@ -2538,10 +2565,10 @@ func (o IOOverloadOptions) allocateReplicaToCheck(
 func (o IOOverloadOptions) rebalanceReplicaToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.ReplicaIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.ReplicaEnforcementLevel,
 		IOOverloadThresholdBlockTransfers, IOOverloadThresholdBlockAll,
@@ -2558,10 +2585,10 @@ func (o IOOverloadOptions) rebalanceReplicaToCheck(
 func (o IOOverloadOptions) transferLeaseToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.LeaseIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.LeaseEnforcementLevel,
 		IOOverloadThresholdBlockTransfers, IOOverloadThresholdShed,
@@ -2579,10 +2606,10 @@ func (o IOOverloadOptions) transferLeaseToCheck(
 func (o IOOverloadOptions) ExistingLeaseCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.LeaseIOOverloadShedThreshold, IOOverloadMeanShedThreshold,
 		o.LeaseEnforcementLevel,
 		IOOverloadThresholdShed,
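
To make the new condition concrete: the disk-unhealthy score acts as a gate layered on the existing IO-overload checks. The standalone Go sketch below is illustrative only — it drops the enforcement-level handling and reason strings, and uses an assumed epsilon constant (the scorer defines its own) — but it shows the same decision shape as ioOverloadCheck.

package main

import "fmt"

// ok mirrors the combined condition in ioOverloadCheck: an action is allowed
// only when the store passes the existing IO-overload checks AND the new
// disk-health gate. Enforcement levels and reason strings are omitted; the
// epsilon here is an assumed small constant, not the scorer's own.
func ok(score, avg, diskUnhealthyScore, absThreshold, meanThreshold float64) bool {
	const epsilon = 1e-10
	absCheck := score < absThreshold
	meanCheck := score < avg*meanThreshold
	diskCheck := diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
	return (absCheck || meanCheck) && diskCheck
}

func main() {
	// Low IO overload, healthy disk (or the score setting left at its default of 0): allowed.
	fmt.Println(ok(0.1, 0.1, 0, 0.2, 1.1)) // true
	// Low IO overload, but an unhealthy disk with a configured score of 0.3
	// (at or above the 0.2 threshold): blocked purely because of disk health.
	fmt.Println(ok(0.1, 0.1, 0.3, 0.2, 1.1)) // false
	// High IO overload is still blocked regardless of disk health.
	fmt.Println(ok(0.5, 0.1, 0, 0.2, 1.1)) // false
}

With kv.allocator.disk_unhealthy_io_overload_score at its default of 0 the gate always passes, which is why the commit is behavior-neutral until an operator raises the score to at least the relevant threshold.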

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go

Lines changed: 42 additions & 0 deletions
@@ -2179,3 +2179,45 @@ func TestCandidateListString(t *testing.T) {
 		"s3, valid:false, fulldisk:false, necessary:false, voterNecessary:false, diversity:1.00, ioOverloaded: false, ioOverload: 1.00, converges:-1, balance:-1, hasNonVoter:false, rangeCount:3, queriesPerSecond:0.00, details:(mock detail 3)]",
 		cl.String())
 }
+
+func TestIOOverloadOptionsDiskUnhealthy(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	ctx := context.Background()
+	options := IOOverloadOptions{
+		UseIOThresholdMax:            false,
+		ReplicaIOOverloadThreshold:   0.2,
+		LeaseIOOverloadThreshold:     0.2,
+		LeaseIOOverloadShedThreshold: 0.2,
+		DiskUnhealthyScore:           0.3,
+	}
+
+	store := roachpb.StoreDescriptor{}
+	o := options
+	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockAll
+	require.True(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockTransfers
+	require.True(t, o.rebalanceReplicaToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.rebalanceReplicaToCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.LeaseEnforcementLevel = IOOverloadThresholdShed
+	require.True(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.LeaseEnforcementLevel = IOOverloadThresholdBlockTransfers
+	require.True(t, o.transferLeaseToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.transferLeaseToCheck(ctx, store, storepool.StoreList{}))
+}
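
The test above exercises a nonzero score. A natural companion case is the default configuration: because DiskUnhealthyScore defaults to 0, a flipping DiskUnhealthy bit should not change any decision. The sketch below is not part of the commit; it assumes the same test file, imports, and helpers as TestIOOverloadOptionsDiskUnhealthy.

// Sketch only, not part of the commit: with
// kv.allocator.disk_unhealthy_io_overload_score left at its default of 0, an
// unhealthy disk must not change any allocator decision.
func TestIOOverloadOptionsDiskUnhealthyDefaultScore(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	o := IOOverloadOptions{
		ReplicaIOOverloadThreshold:   0.2,
		LeaseIOOverloadThreshold:     0.2,
		LeaseIOOverloadShedThreshold: 0.2,
		DiskUnhealthyScore:           0, // the default
	}
	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockAll
	o.LeaseEnforcementLevel = IOOverloadThresholdShed

	store := roachpb.StoreDescriptor{}
	store.Capacity.IOThreshold.DiskUnhealthy = true
	// diskUnhealthyScore stays 0, so the disk gate always passes and the
	// existing checks decide as before.
	require.True(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
	require.True(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
}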

pkg/kv/kvserver/metrics.go

Lines changed: 13 additions & 5 deletions
@@ -1247,7 +1247,12 @@ var (
 		Measurement: "Events",
 		Unit:        metric.Unit_COUNT,
 	}
-
+	metaDiskUnhealthyDuration = metric.Metadata{
+		Name:        "storage.disk-unhealthy.duration",
+		Help:        "Total disk unhealthy duration in nanos",
+		Measurement: "Nanoseconds",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
 	// Range event metrics.
 	metaRangeSplits = metric.Metadata{
 		Name: "range.splits",
@@ -3111,8 +3116,9 @@ type StoreMetrics struct {
 	RdbCheckpoints *metric.Gauge
 
 	// Disk health metrics.
-	DiskSlow    *metric.Counter
-	DiskStalled *metric.Counter
+	DiskSlow              *metric.Counter
+	DiskStalled           *metric.Counter
+	DiskUnhealthyDuration *metric.Counter
 
 	// TODO(mrtracy): This should be removed as part of #4465. This is only
 	// maintained to keep the current structure of NodeStatus; it would be
@@ -3868,8 +3874,9 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RdbCheckpoints: metric.NewGauge(metaRdbCheckpoints),
 
 		// Disk health metrics.
-		DiskSlow:    metric.NewCounter(metaDiskSlow),
-		DiskStalled: metric.NewCounter(metaDiskStalled),
+		DiskSlow:              metric.NewCounter(metaDiskSlow),
+		DiskStalled:           metric.NewCounter(metaDiskStalled),
+		DiskUnhealthyDuration: metric.NewCounter(metaDiskUnhealthyDuration),
 
 		// Range event metrics.
 		RangeSplits: metric.NewCounter(metaRangeSplits),
@@ -4260,6 +4267,7 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
 	sm.RdbWriteStallNanos.Update(m.WriteStallDuration.Nanoseconds())
 	sm.DiskSlow.Update(m.DiskSlowCount)
 	sm.DiskStalled.Update(m.DiskStallCount)
+	sm.DiskUnhealthyDuration.Update(int64(m.DiskUnhealthyDuration))
 	sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes))
 	sm.IterBlockBytesInCache.Update(int64(m.Iterator.BlockBytesInCache))
 	sm.IterBlockReadDuration.Update(int64(m.Iterator.BlockReadDuration))

pkg/kv/kvserver/store_gossip.go

Lines changed: 6 additions & 0 deletions
@@ -522,6 +522,9 @@ func (s *StoreGossip) shouldGossipOnCapacityDelta() (should bool, reason string)
 	lastGossipMaxIOScore, _ := s.cachedCapacity.lastGossiped.IOThresholdMax.Score()
 	updateForMaxIOOverloadScore := cachedMaxIOScore >= gossipMinMaxIOOverloadScore &&
 		cachedMaxIOScore > lastGossipMaxIOScore
+	diskUnhealthy := s.cachedCapacity.cached.IOThreshold.DiskUnhealthy
+	updateForChangeInDiskUnhealth :=
+		s.cachedCapacity.lastGossiped.IOThreshold.DiskUnhealthy != diskUnhealthy
 	s.cachedCapacity.Unlock()
 
 	if s.knobs.DisableLeaseCapacityGossip {
@@ -549,6 +552,9 @@ func (s *StoreGossip) shouldGossipOnCapacityDelta() (should bool, reason string)
 	if updateForMaxIOOverloadScore {
 		reason += fmt.Sprintf("io-overload(%.1f) ", cachedMaxIOScore)
 	}
+	if updateForChangeInDiskUnhealth {
+		reason += fmt.Sprintf("disk-unhealthy(%t) ", diskUnhealthy)
+	}
 	if reason != "" {
 		should = true
 		reason += "change"

pkg/roachprod/opentelemetry/cockroachdb_metrics.go

Lines changed: 1 addition & 0 deletions
@@ -1924,6 +1924,7 @@ var cockroachdbMetrics = map[string]string{
 	"storage_disk_read_time": "storage.disk.read.time",
 	"storage_disk_slow": "storage.disk_slow",
 	"storage_disk_stalled": "storage.disk_stalled",
+	"storage_disk_unhealthy_duration": "storage.disk-unhealthy.duration",
 	"storage_disk_weightedio_time": "storage.disk.weightedio.time",
 	"storage_disk_write_bytes": "storage.disk.write.bytes",
 	"storage_disk_write_count": "storage.disk.write.count",

pkg/server/node.go

Lines changed: 2 additions & 0 deletions
@@ -1374,10 +1374,12 @@ func (pmp *nodePebbleMetricsProvider) GetPebbleMetrics() []admission.StoreMetric
 		if s, ok := storeIDToDiskStats[store.StoreID()]; ok {
 			diskStats = s
 		}
+		diskUnhealthy := eng.GetDiskUnhealthy()
 		metrics = append(metrics, admission.StoreMetrics{
 			StoreID:                   store.StoreID(),
 			Metrics:                   m.Metrics,
 			WriteStallCount:           m.WriteStallCount,
+			DiskUnhealthy:             diskUnhealthy,
 			DiskStats:                 diskStats,
 			MemTableSizeForStopWrites: memTableSizeForStopWrites,
 		})

pkg/storage/engine.go

Lines changed: 14 additions & 0 deletions
@@ -1134,6 +1134,17 @@ type Engine interface {
 	// GetPebbleOptions returns the options used when creating the engine. The
 	// caller must not modify these.
 	GetPebbleOptions() *pebble.Options
+
+	// GetDiskUnhealthy returns true if the engine has determined that the
+	// underlying disk is transiently unhealthy. This can change from false to
+	// true and back to false. The engine has mechanisms to mask disk unhealth
+	// (e.g. if WAL failover is configured), but in some cases the unhealth is
+	// longer than what the engine may be able to successfully mask, but not yet
+	// long enough to crash the node (see
+	// COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT). This method returns true in
+	// this intermediate case. Currently, this mainly feeds into allocation
+	// decisions by the caller (such as shedding leases).
+	GetDiskUnhealthy() bool
 }
 
 // Batch is the interface for batch specific operations.
@@ -1250,6 +1261,9 @@ type Metrics struct {
 	// distinguished in the pebble logs.
 	WriteStallCount    int64
 	WriteStallDuration time.Duration
+	// DiskUnhealthyDuration is the duration for which Engine.GetDiskUnhealthy
+	// has returned true.
+	DiskUnhealthyDuration time.Duration
 
 	// BlockLoadConcurrencyLimit is the current limit on the number of concurrent
 	// sstable block reads.
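
The engine-side bookkeeping that drives GetDiskUnhealthy and DiskUnhealthyDuration is not shown in this diff. As a rough, illustrative model only (the type and method names below are assumptions, not CockroachDB code), an engine would need to record transitions of the unhealthy state, triggered by writes that stay outstanding longer than storage.unhealthy_write_duration, and accumulate the time spent unhealthy:

package main

import (
	"fmt"
	"sync"
	"time"
)

// diskHealthTracker is a hypothetical helper, NOT the engine's actual
// implementation: it models the bookkeeping needed to back
// Engine.GetDiskUnhealthy and Metrics.DiskUnhealthyDuration.
type diskHealthTracker struct {
	mu             sync.Mutex
	unhealthy      bool
	unhealthySince time.Time
	total          time.Duration
}

// setUnhealthy records a transition of the disk-health state, e.g. when a
// write exceeds storage.unhealthy_write_duration (unhealthy) or completes
// again in time (healthy).
func (t *diskHealthTracker) setUnhealthy(now time.Time, unhealthy bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if unhealthy == t.unhealthy {
		return
	}
	if unhealthy {
		t.unhealthySince = now
	} else {
		t.total += now.Sub(t.unhealthySince)
	}
	t.unhealthy = unhealthy
}

// getDiskUnhealthy is the shape of answer Engine.GetDiskUnhealthy would give.
func (t *diskHealthTracker) getDiskUnhealthy() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.unhealthy
}

// unhealthyDuration is the cumulative value that would feed
// Metrics.DiskUnhealthyDuration and the storage.disk-unhealthy.duration counter.
func (t *diskHealthTracker) unhealthyDuration(now time.Time) time.Duration {
	t.mu.Lock()
	defer t.mu.Unlock()
	d := t.total
	if t.unhealthy {
		d += now.Sub(t.unhealthySince)
	}
	return d
}

func main() {
	var t diskHealthTracker
	start := time.Now()
	// A write exceeds the unhealthy threshold; the disk recovers 30s later.
	t.setUnhealthy(start, true)
	t.setUnhealthy(start.Add(30*time.Second), false)
	fmt.Println(t.getDiskUnhealthy())                        // false
	fmt.Println(t.unhealthyDuration(start.Add(time.Minute))) // 30s
}

The cumulative, monotonically increasing duration matches the metric's COUNTER type and NON_NEGATIVE_DERIVATIVE aggregation in metrics.yaml: taking its rate gives the fraction of time the disk was unhealthy.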
