
Commit da44d30

Merge pull request #154459 from cockroachdb/blathers/backport-release-25.4-153364
release-25.4: admission,storage: increase overload score of store with disk slowness
2 parents 52db258 + 37b913f commit da44d30

17 files changed: +454 -32 lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 8 additions & 0 deletions

@@ -16367,6 +16367,14 @@ layers:
       unit: COUNT
       aggregation: AVG
       derivative: NON_NEGATIVE_DERIVATIVE
+    - name: storage.disk-unhealthy.duration
+      exported_name: storage_disk_unhealthy_duration
+      description: Total disk unhealthy duration in nanos
+      y_axis_label: Nanoseconds
+      type: COUNTER
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NON_NEGATIVE_DERIVATIVE
     - name: storage.disk.io.time
       exported_name: storage_disk_io_time
       description: Time spent reading from or writing to the store's disk since this process started (as reported by the OS)

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions

@@ -367,6 +367,7 @@
 <tr><td><div id="setting-storage-sstable-compression-algorithm" class="anchored"><code>storage.sstable.compression_algorithm</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for use in a Pebble store (balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, balanced = 6, good = 7]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
 <tr><td><div id="setting-storage-sstable-compression-algorithm-backup-storage" class="anchored"><code>storage.sstable.compression_algorithm_backup_storage</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for backup row data storage (fast,balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, fast = 6, balanced = 7, good = 8]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
 <tr><td><div id="setting-storage-sstable-compression-algorithm-backup-transport" class="anchored"><code>storage.sstable.compression_algorithm_backup_transport</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for backup transport (fast,balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, fast = 6, balanced = 7, good = 8]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
+<tr><td><div id="setting-storage-unhealthy-write-duration" class="anchored"><code>storage.unhealthy_write_duration</code></div></td><td>duration</td><td><code>20s</code></td><td>duration for disk write operations, beyond which the disk will be reported as unhealthy for higher layer actions</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-storage-wal-failover-unhealthy-op-threshold" class="anchored"><code>storage.wal_failover.unhealthy_op_threshold</code></div></td><td>duration</td><td><code>100ms</code></td><td>the latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL location</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-enabled" class="anchored"><code>timeseries.storage.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhere</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-resolution-10s-ttl" class="anchored"><code>timeseries.storage.resolution_10s.ttl</code></div></td><td>duration</td><td><code>240h0m0s</code></td><td>the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>

pkg/kv/kvserver/allocator/allocatorimpl/allocator.go

Lines changed: 1 addition & 0 deletions

@@ -2358,6 +2358,7 @@ func (a *Allocator) IOOverloadOptions() IOOverloadOptions {
 		ReplicaIOOverloadThreshold:   ReplicaIOOverloadThreshold.Get(&a.st.SV),
 		LeaseIOOverloadThreshold:     LeaseIOOverloadThreshold.Get(&a.st.SV),
 		LeaseIOOverloadShedThreshold: LeaseIOOverloadShedThreshold.Get(&a.st.SV),
+		DiskUnhealthyScore:           DiskUnhealthyIOOverloadScore.Get(&a.st.SV),
 	}
 }

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go

Lines changed: 47 additions & 20 deletions

@@ -229,6 +229,20 @@ var LeaseIOOverloadThresholdEnforcement = settings.RegisterEnumSetting(
 	},
 )
 
+// DiskUnhealthyIOOverloadScore is the IO overload score assigned to a store
+// when its disk is considered unhealthy. The value set here will be used in
+// the context of {LeaseIOOverloadThreshold, LeaseIOOverloadShedThreshold,
+// LeaseIOOverloadThresholdEnforcement} to shed leases, and in the context of
+// {ReplicaIOOverloadThreshold, ReplicaIOOverloadThresholdEnforcement}, when
+// transferring replicas.
+//
+// TODO(sumeer): change to DefaultLeaseIOOverloadShedThreshold after discussion.
+var DiskUnhealthyIOOverloadScore = settings.RegisterFloatSetting(
+	settings.SystemOnly,
+	"kv.allocator.disk_unhealthy_io_overload_score",
+	"the IO overload score to assign to a store when its disk is unhealthy",
+	0)
+
 // maxDiskUtilizationThreshold controls the point at which the store cedes
 // having room for new replicas. If the fraction used of a store descriptor
 // capacity is greater than this value, it will never be used as a rebalance or
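The setting defaults to 0, which leaves allocator behavior unchanged. Below is a minimal sketch of the intended semantics, assuming an epsilon constant like the one used by the check added further down in this file; it is illustrative only, not CockroachDB code.

```go
package main

import "fmt"

const epsilon = 1e-10 // assumed small constant, not the actual value

// diskUnhealthyPasses mirrors the disk-specific part of the check: the
// default score of 0 always passes, and a configured score only fails once
// it reaches the absolute threshold in effect.
func diskUnhealthyPasses(diskUnhealthyScore, absThreshold float64) bool {
	return diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
}

func main() {
	const shedThreshold = 0.2 // stand-in for a lease shed threshold value
	fmt.Println(diskUnhealthyPasses(0.0, shedThreshold)) // true: default of 0 is a no-op
	fmt.Println(diskUnhealthyPasses(0.3, shedThreshold)) // false: store treated as IO-overloaded
}
```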
@@ -2466,42 +2480,55 @@ type IOOverloadOptions struct {
 	ReplicaIOOverloadThreshold   float64
 	LeaseIOOverloadThreshold     float64
 	LeaseIOOverloadShedThreshold float64
+
+	DiskUnhealthyScore float64
 }
 
 func ioOverloadCheck(
-	score, avg, absThreshold, meanThreshold float64,
+	overloadScore, overloadAvg, diskUnhealthyScore, absThreshold, meanThreshold float64,
 	enforcement IOOverloadEnforcementLevel,
 	disallowed ...IOOverloadEnforcementLevel,
 ) (ok bool, reason string) {
-	absCheck := score < absThreshold
-	meanCheck := score < avg*meanThreshold
+	absCheck := overloadScore < absThreshold
+	meanCheck := overloadScore < overloadAvg*meanThreshold
+	// We do not bother with the mean for the disk unhealthy score, because disk
+	// unhealthiness is rare, and also because this code was bolted on later
+	// (and was simpler to ignore the mean).
+	//
+	// TODO(sumeer): revisit this if this turns out to be useful, and do a
+	// cleaner version for MMA.
+	diskCheck := diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
 
 	// The score needs to be no less than both the average threshold and the
 	// absolute threshold in order to be considered IO overloaded.
-	if absCheck || meanCheck {
+	if (absCheck || meanCheck) && diskCheck {
 		return true, ""
 	}
 
 	for _, disallowedEnforcement := range disallowed {
 		if enforcement == disallowedEnforcement {
 			return false, fmt.Sprintf(
-				"io overload %.2f exceeds threshold %.2f, above average: %.2f, enforcement %d",
-				score, absThreshold, avg, enforcement)
+				"io overload %.2f (disk %.2f) exceeds threshold %.2f, above average: %.2f, enforcement %d",
+				overloadScore, diskUnhealthyScore, absThreshold, overloadAvg, enforcement)
 		}
 	}
 
 	return true, ""
 }
 
-func (o IOOverloadOptions) storeScore(store roachpb.StoreDescriptor) float64 {
-	var score float64
+func (o IOOverloadOptions) storeScore(
+	store roachpb.StoreDescriptor,
+) (overloadScore float64, diskUnhealthyScore float64) {
 	if o.UseIOThresholdMax {
-		score, _ = store.Capacity.IOThresholdMax.Score()
+		overloadScore, _ = store.Capacity.IOThresholdMax.Score()
 	} else {
-		score, _ = store.Capacity.IOThreshold.Score()
+		overloadScore, _ = store.Capacity.IOThreshold.Score()
 	}
-
-	return score
+	diskUnhealthyScore = 0
+	if store.Capacity.IOThreshold.DiskUnhealthy {
+		diskUnhealthyScore = o.DiskUnhealthyScore
+	}
+	return overloadScore, diskUnhealthyScore
 }
 
 func (o IOOverloadOptions) storeListAvgScore(storeList storepool.StoreList) float64 {
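For readability, here is a standalone sketch of the combined check introduced above, with the enforcement-level handling omitted and epsilon assumed to be a small constant; it is not the CockroachDB implementation.

```go
package main

import "fmt"

const epsilon = 1e-10 // assumed small constant

// ioOverloadOK reports whether a store is acceptable: the regular IO-overload
// score must pass the absolute or mean-relative check, AND the disk-unhealthy
// score must pass the absolute check (a zero score always passes).
func ioOverloadOK(overloadScore, overloadAvg, diskUnhealthyScore, absThreshold, meanThreshold float64) bool {
	absCheck := overloadScore < absThreshold
	meanCheck := overloadScore < overloadAvg*meanThreshold
	diskCheck := diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
	return (absCheck || meanCheck) && diskCheck
}

func main() {
	// Healthy disk, low IO-overload score: the store is acceptable.
	fmt.Println(ioOverloadOK(0.1, 0.1, 0.0, 0.2, 1.1)) // true
	// Unhealthy disk with a configured score above the threshold: rejected,
	// even though the regular IO-overload score is well below the threshold.
	fmt.Println(ioOverloadOK(0.1, 0.1, 0.3, 0.2, 1.1)) // false
}
```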
@@ -2517,10 +2544,10 @@ func (o IOOverloadOptions) storeListAvgScore(storeList storepool.StoreList) floa
 func (o IOOverloadOptions) allocateReplicaToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.ReplicaIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.ReplicaEnforcementLevel,
 		IOOverloadThresholdBlockAll,
@@ -2538,10 +2565,10 @@ func (o IOOverloadOptions) allocateReplicaToCheck(
 func (o IOOverloadOptions) rebalanceReplicaToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.ReplicaIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.ReplicaEnforcementLevel,
 		IOOverloadThresholdBlockTransfers, IOOverloadThresholdBlockAll,
@@ -2558,10 +2585,10 @@ func (o IOOverloadOptions) rebalanceReplicaToCheck(
 func (o IOOverloadOptions) transferLeaseToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.LeaseIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.LeaseEnforcementLevel,
 		IOOverloadThresholdBlockTransfers, IOOverloadThresholdShed,
@@ -2579,10 +2606,10 @@ func (o IOOverloadOptions) transferLeaseToCheck(
 func (o IOOverloadOptions) ExistingLeaseCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.LeaseIOOverloadShedThreshold, IOOverloadMeanShedThreshold,
 		o.LeaseEnforcementLevel,
 		IOOverloadThresholdShed,

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go

Lines changed: 42 additions & 0 deletions

@@ -2179,3 +2179,45 @@ func TestCandidateListString(t *testing.T) {
 		"s3, valid:false, fulldisk:false, necessary:false, voterNecessary:false, diversity:1.00, ioOverloaded: false, ioOverload: 1.00, converges:-1, balance:-1, hasNonVoter:false, rangeCount:3, queriesPerSecond:0.00, details:(mock detail 3)]",
 		cl.String())
 }
+
+func TestIOOverloadOptionsDiskUnhealthy(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	ctx := context.Background()
+	options := IOOverloadOptions{
+		UseIOThresholdMax:            false,
+		ReplicaIOOverloadThreshold:   0.2,
+		LeaseIOOverloadThreshold:     0.2,
+		LeaseIOOverloadShedThreshold: 0.2,
+		DiskUnhealthyScore:           0.3,
+	}
+
+	store := roachpb.StoreDescriptor{}
+	o := options
+	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockAll
+	require.True(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockTransfers
+	require.True(t, o.rebalanceReplicaToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.rebalanceReplicaToCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.LeaseEnforcementLevel = IOOverloadThresholdShed
+	require.True(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.LeaseEnforcementLevel = IOOverloadThresholdBlockTransfers
+	require.True(t, o.transferLeaseToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.transferLeaseToCheck(ctx, store, storepool.StoreList{}))
+}

pkg/kv/kvserver/metrics.go

Lines changed: 13 additions & 5 deletions

@@ -1247,7 +1247,12 @@ var (
 		Measurement: "Events",
 		Unit:        metric.Unit_COUNT,
 	}
-
+	metaDiskUnhealthyDuration = metric.Metadata{
+		Name:        "storage.disk-unhealthy.duration",
+		Help:        "Total disk unhealthy duration in nanos",
+		Measurement: "Nanoseconds",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
 	// Range event metrics.
 	metaRangeSplits = metric.Metadata{
 		Name: "range.splits",
@@ -3111,8 +3116,9 @@ type StoreMetrics struct {
 	RdbCheckpoints *metric.Gauge
 
 	// Disk health metrics.
-	DiskSlow    *metric.Counter
-	DiskStalled *metric.Counter
+	DiskSlow              *metric.Counter
+	DiskStalled           *metric.Counter
+	DiskUnhealthyDuration *metric.Counter
 
 	// TODO(mrtracy): This should be removed as part of #4465. This is only
 	// maintained to keep the current structure of NodeStatus; it would be
@@ -3868,8 +3874,9 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RdbCheckpoints: metric.NewGauge(metaRdbCheckpoints),
 
 		// Disk health metrics.
-		DiskSlow:    metric.NewCounter(metaDiskSlow),
-		DiskStalled: metric.NewCounter(metaDiskStalled),
+		DiskSlow:              metric.NewCounter(metaDiskSlow),
+		DiskStalled:           metric.NewCounter(metaDiskStalled),
+		DiskUnhealthyDuration: metric.NewCounter(metaDiskUnhealthyDuration),
 
 		// Range event metrics.
 		RangeSplits: metric.NewCounter(metaRangeSplits),
@@ -4261,6 +4268,7 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
 	sm.RdbWriteStallNanos.Update(m.WriteStallDuration.Nanoseconds())
 	sm.DiskSlow.Update(m.DiskSlowCount)
 	sm.DiskStalled.Update(m.DiskStallCount)
+	sm.DiskUnhealthyDuration.Update(int64(m.DiskUnhealthyDuration))
 	sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes))
 	sm.IterBlockBytesInCache.Update(int64(m.Iterator.BlockBytesInCache))
 	sm.IterBlockReadDuration.Update(int64(m.Iterator.BlockReadDuration))

pkg/kv/kvserver/store_gossip.go

Lines changed: 6 additions & 0 deletions

@@ -522,6 +522,9 @@ func (s *StoreGossip) shouldGossipOnCapacityDelta() (should bool, reason string)
 	lastGossipMaxIOScore, _ := s.cachedCapacity.lastGossiped.IOThresholdMax.Score()
 	updateForMaxIOOverloadScore := cachedMaxIOScore >= gossipMinMaxIOOverloadScore &&
 		cachedMaxIOScore > lastGossipMaxIOScore
+	diskUnhealthy := s.cachedCapacity.cached.IOThreshold.DiskUnhealthy
+	updateForChangeInDiskUnhealth :=
+		s.cachedCapacity.lastGossiped.IOThreshold.DiskUnhealthy != diskUnhealthy
 	s.cachedCapacity.Unlock()
 
 	if s.knobs.DisableLeaseCapacityGossip {
@@ -549,6 +552,9 @@ func (s *StoreGossip) shouldGossipOnCapacityDelta() (should bool, reason string)
 	if updateForMaxIOOverloadScore {
 		reason += fmt.Sprintf("io-overload(%.1f) ", cachedMaxIOScore)
 	}
+	if updateForChangeInDiskUnhealth {
+		reason += fmt.Sprintf("disk-unhealthy(%t) ", diskUnhealthy)
+	}
 	if reason != "" {
 		should = true
 		reason += "change"
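The gossip trigger is plain change detection on the DiskUnhealthy bit. A minimal sketch of the pattern, using hypothetical types rather than the CockroachDB ones:

```go
package main

import "fmt"

// ioThreshold is a hypothetical stand-in carrying only the field used here.
type ioThreshold struct {
	DiskUnhealthy bool
}

// shouldGossipForDiskUnhealth reports whether capacity should be re-gossiped
// because the cached DiskUnhealthy value differs from the last gossiped one.
func shouldGossipForDiskUnhealth(lastGossiped, cached ioThreshold) (bool, string) {
	if cached.DiskUnhealthy != lastGossiped.DiskUnhealthy {
		return true, fmt.Sprintf("disk-unhealthy(%t) change", cached.DiskUnhealthy)
	}
	return false, ""
}

func main() {
	last := ioThreshold{DiskUnhealthy: false}
	cur := ioThreshold{DiskUnhealthy: true}
	should, reason := shouldGossipForDiskUnhealth(last, cur)
	fmt.Println(should, reason) // true disk-unhealthy(true) change
}
```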

pkg/roachprod/opentelemetry/cockroachdb_metrics.go

Lines changed: 1 addition & 0 deletions

@@ -1924,6 +1924,7 @@ var cockroachdbMetrics = map[string]string{
 	"storage_disk_read_time":          "storage.disk.read.time",
 	"storage_disk_slow":               "storage.disk_slow",
 	"storage_disk_stalled":            "storage.disk_stalled",
+	"storage_disk_unhealthy_duration": "storage.disk-unhealthy.duration",
 	"storage_disk_weightedio_time":    "storage.disk.weightedio.time",
 	"storage_disk_write_bytes":        "storage.disk.write.bytes",
 	"storage_disk_write_count":        "storage.disk.write.count",

pkg/server/node.go

Lines changed: 2 additions & 0 deletions

@@ -1374,10 +1374,12 @@ func (pmp *nodePebbleMetricsProvider) GetPebbleMetrics() []admission.StoreMetric
 		if s, ok := storeIDToDiskStats[store.StoreID()]; ok {
 			diskStats = s
 		}
+		diskUnhealthy := eng.GetDiskUnhealthy()
 		metrics = append(metrics, admission.StoreMetrics{
 			StoreID:                   store.StoreID(),
 			Metrics:                   m.Metrics,
 			WriteStallCount:           m.WriteStallCount,
+			DiskUnhealthy:             diskUnhealthy,
 			DiskStats:                 diskStats,
 			MemTableSizeForStopWrites: memTableSizeForStopWrites,
 		})

pkg/storage/engine.go

Lines changed: 14 additions & 0 deletions

@@ -1134,6 +1134,17 @@
 	// GetPebbleOptions returns the options used when creating the engine. The
 	// caller must not modify these.
 	GetPebbleOptions() *pebble.Options
+
+	// GetDiskUnhealthy returns true if the engine has determined that the
+	// underlying disk is transiently unhealthy. This can change from false to
+	// true and back to false. The engine has mechanisms to mask disk unhealth
+	// (e.g. if WAL failover is configured), but in some cases the unhealth is
+	// longer than what the engine may be able to successfully mask, but not yet
+	// long enough to crash the node (see
+	// COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT). This method returns true in
+	// this intermediate case. Currently, this mainly feeds into allocation
+	// decisions by the caller (such as shedding leases).
+	GetDiskUnhealthy() bool
 }
 
 // Batch is the interface for batch specific operations.
@@ -1250,6 +1261,9 @@ type Metrics struct {
 	// distinguished in the pebble logs.
 	WriteStallCount    int64
 	WriteStallDuration time.Duration
+	// DiskUnhealthyDuration is the duration for which Engine.GetDiskUnhealthy
+	// has returned true.
+	DiskUnhealthyDuration time.Duration
 
 	// BlockLoadConcurrencyLimit is the current limit on the number of concurrent
 	// sstable block reads.
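DiskUnhealthyDuration is cumulative and is derived from the boolean disk-unhealthy signal over time. A hypothetical accumulator sketch (an assumption about how such a duration could be tracked; not the Pebble/CockroachDB implementation):

```go
package main

import (
	"fmt"
	"time"
)

// unhealthyTracker accumulates the total time a boolean unhealthy signal has
// been true, across healthy/unhealthy transitions.
type unhealthyTracker struct {
	total time.Duration
	since time.Time // zero while currently healthy
}

// observe records the unhealthy signal as of time now.
func (t *unhealthyTracker) observe(unhealthy bool, now time.Time) {
	switch {
	case unhealthy && t.since.IsZero():
		t.since = now // healthy -> unhealthy transition
	case !unhealthy && !t.since.IsZero():
		t.total += now.Sub(t.since) // unhealthy -> healthy transition
		t.since = time.Time{}
	}
}

// duration returns the cumulative unhealthy duration as of time now.
func (t *unhealthyTracker) duration(now time.Time) time.Duration {
	if !t.since.IsZero() {
		return t.total + now.Sub(t.since)
	}
	return t.total
}

func main() {
	var tr unhealthyTracker
	start := time.Now()
	tr.observe(true, start)
	tr.observe(false, start.Add(5*time.Second))
	fmt.Println(tr.duration(start.Add(10 * time.Second))) // 5s
}
```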
