
Commit 3834ff7

admission,storage: increase overload score of store with disk slowness

So that the allocator accounts for it in shedding leases etc. The unhealthy
duration is also exported as a metric.

Informs #153280

Epic: none

Release note (ops change): The cluster setting storage.unhealthy_write_duration
(defaults to 20s) is used to indicate to the allocator that a store's disk is
unhealthy. The cluster setting kv.allocator.disk_unhealthy_io_overload_score
controls the overload score assigned to a store with an unhealthy disk; a
higher score prevents lease or replica transfers to the store and causes the
store to shed its leases. The default value of that setting is 0, so allocator
behavior is unaffected by default.
1 parent 1b92a23 commit 3834ff7

File tree

17 files changed: +454 -32 lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 8 additions & 0 deletions
@@ -16351,6 +16351,14 @@ layers:
       unit: COUNT
       aggregation: AVG
       derivative: NON_NEGATIVE_DERIVATIVE
+    - name: storage.disk-unhealthy.duration
+      exported_name: storage_disk_unhealthy_duration
+      description: Total disk unhealthy duration in nanos
+      y_axis_label: Nanoseconds
+      type: COUNTER
+      unit: NANOSECONDS
+      aggregation: AVG
+      derivative: NON_NEGATIVE_DERIVATIVE
     - name: storage.disk.io.time
       exported_name: storage_disk_io_time
       description: Time spent reading from or writing to the store's disk since this process started (as reported by the OS)

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
@@ -367,6 +367,7 @@
 <tr><td><div id="setting-storage-sstable-compression-algorithm" class="anchored"><code>storage.sstable.compression_algorithm</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for use in a Pebble store (balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, balanced = 6, good = 7]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
 <tr><td><div id="setting-storage-sstable-compression-algorithm-backup-storage" class="anchored"><code>storage.sstable.compression_algorithm_backup_storage</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for backup row data storage (fast,balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, fast = 6, balanced = 7, good = 8]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
 <tr><td><div id="setting-storage-sstable-compression-algorithm-backup-transport" class="anchored"><code>storage.sstable.compression_algorithm_backup_transport</code></div></td><td>enumeration</td><td><code>fastest</code></td><td>determines the compression algorithm to use when compressing sstable data blocks for backup transport (fast,balanced,good are experimental); [snappy = 1, zstd = 2, none = 3, minlz = 4, fastest = 5, fast = 6, balanced = 7, good = 8]</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>
+<tr><td><div id="setting-storage-unhealthy-write-duration" class="anchored"><code>storage.unhealthy_write_duration</code></div></td><td>duration</td><td><code>20s</code></td><td>duration for disk write operations, beyond which the disk will be reported as unhealthy for higher layer actions</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-storage-wal-failover-unhealthy-op-threshold" class="anchored"><code>storage.wal_failover.unhealthy_op_threshold</code></div></td><td>duration</td><td><code>100ms</code></td><td>the latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL location</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-enabled" class="anchored"><code>timeseries.storage.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhere</td><td>Advanced/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-resolution-10s-ttl" class="anchored"><code>timeseries.storage.resolution_10s.ttl</code></div></td><td>duration</td><td><code>240h0m0s</code></td><td>the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.</td><td>Advanced/Self-hosted (read-write); Basic/Standard (read-only)</td></tr>

pkg/kv/kvserver/allocator/allocatorimpl/allocator.go

Lines changed: 1 addition & 0 deletions
@@ -2358,6 +2358,7 @@ func (a *Allocator) IOOverloadOptions() IOOverloadOptions {
 		ReplicaIOOverloadThreshold:   ReplicaIOOverloadThreshold.Get(&a.st.SV),
 		LeaseIOOverloadThreshold:     LeaseIOOverloadThreshold.Get(&a.st.SV),
 		LeaseIOOverloadShedThreshold: LeaseIOOverloadShedThreshold.Get(&a.st.SV),
+		DiskUnhealthyScore:           DiskUnhealthyIOOverloadScore.Get(&a.st.SV),
 	}
 }
 

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go

Lines changed: 47 additions & 20 deletions
@@ -229,6 +229,20 @@ var LeaseIOOverloadThresholdEnforcement = settings.RegisterEnumSetting(
 	},
 )
 
+// DiskUnhealthyIOOverloadScore is the IO overload score assigned to a store
+// when its disk is considered unhealthy. The value set here will be used in
+// the context of {LeaseIOOverloadThreshold, LeaseIOOverloadShedThreshold,
+// LeaseIOOverloadThresholdEnforcement} to shed leases, and in the context of
+// {ReplicaIOOverloadThreshold, ReplicaIOOverloadThresholdEnforcement}, when
+// transferring replicas.
+//
+// TODO(sumeer): change to DefaultLeaseIOOverloadShedThreshold after discussion.
+var DiskUnhealthyIOOverloadScore = settings.RegisterFloatSetting(
+	settings.SystemOnly,
+	"kv.allocator.disk_unhealthy_io_overload_score",
+	"the IO overload score to assign to a store when its disk is unhealthy",
+	0)
+
 // maxDiskUtilizationThreshold controls the point at which the store cedes
 // having room for new replicas. If the fraction used of a store descriptor
 // capacity is greater than this value, it will never be used as a rebalance or
@@ -2466,42 +2480,55 @@ type IOOverloadOptions struct {
 	ReplicaIOOverloadThreshold   float64
 	LeaseIOOverloadThreshold     float64
 	LeaseIOOverloadShedThreshold float64
+
+	DiskUnhealthyScore float64
 }
 
 func ioOverloadCheck(
-	score, avg, absThreshold, meanThreshold float64,
+	overloadScore, overloadAvg, diskUnhealthyScore, absThreshold, meanThreshold float64,
 	enforcement IOOverloadEnforcementLevel,
 	disallowed ...IOOverloadEnforcementLevel,
 ) (ok bool, reason string) {
-	absCheck := score < absThreshold
-	meanCheck := score < avg*meanThreshold
+	absCheck := overloadScore < absThreshold
+	meanCheck := overloadScore < overloadAvg*meanThreshold
+	// We do not bother with the mean for the disk unhealthy score, because disk
+	// unhealthiness is rare, and also because this code was bolted on later
+	// (and was simpler to ignore the mean).
+	//
+	// TODO(sumeer): revisit this if this turns out to be useful, and do a
+	// cleaner version for MMA.
+	diskCheck := diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
 
 	// The score needs to be no less than both the average threshold and the
 	// absolute threshold in order to be considered IO overloaded.
-	if absCheck || meanCheck {
+	if (absCheck || meanCheck) && diskCheck {
 		return true, ""
 	}
 
 	for _, disallowedEnforcement := range disallowed {
 		if enforcement == disallowedEnforcement {
 			return false, fmt.Sprintf(
-				"io overload %.2f exceeds threshold %.2f, above average: %.2f, enforcement %d",
-				score, absThreshold, avg, enforcement)
+				"io overload %.2f (disk %.2f) exceeds threshold %.2f, above average: %.2f, enforcement %d",
+				overloadScore, diskUnhealthyScore, absThreshold, overloadAvg, enforcement)
 		}
 	}
 
 	return true, ""
 }
 
-func (o IOOverloadOptions) storeScore(store roachpb.StoreDescriptor) float64 {
-	var score float64
+func (o IOOverloadOptions) storeScore(
+	store roachpb.StoreDescriptor,
+) (overloadScore float64, diskUnhealthyScore float64) {
 	if o.UseIOThresholdMax {
-		score, _ = store.Capacity.IOThresholdMax.Score()
+		overloadScore, _ = store.Capacity.IOThresholdMax.Score()
 	} else {
-		score, _ = store.Capacity.IOThreshold.Score()
+		overloadScore, _ = store.Capacity.IOThreshold.Score()
 	}
-
-	return score
+	diskUnhealthyScore = 0
+	if store.Capacity.IOThreshold.DiskUnhealthy {
+		diskUnhealthyScore = o.DiskUnhealthyScore
+	}
+	return overloadScore, diskUnhealthyScore
 }
 
 func (o IOOverloadOptions) storeListAvgScore(storeList storepool.StoreList) float64 {
@@ -2517,10 +2544,10 @@ func (o IOOverloadOptions) storeListAvgScore(storeList storepool.StoreList) floa
 func (o IOOverloadOptions) allocateReplicaToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.ReplicaIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.ReplicaEnforcementLevel,
 		IOOverloadThresholdBlockAll,
@@ -2538,10 +2565,10 @@ func (o IOOverloadOptions) allocateReplicaToCheck(
 func (o IOOverloadOptions) rebalanceReplicaToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.ReplicaIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.ReplicaEnforcementLevel,
 		IOOverloadThresholdBlockTransfers, IOOverloadThresholdBlockAll,
@@ -2558,10 +2585,10 @@ func (o IOOverloadOptions) rebalanceReplicaToCheck(
 func (o IOOverloadOptions) transferLeaseToCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.LeaseIOOverloadThreshold, IOOverloadMeanThreshold,
 		o.LeaseEnforcementLevel,
 		IOOverloadThresholdBlockTransfers, IOOverloadThresholdShed,
@@ -2579,10 +2606,10 @@ func (o IOOverloadOptions) transferLeaseToCheck(
 func (o IOOverloadOptions) ExistingLeaseCheck(
 	ctx context.Context, store roachpb.StoreDescriptor, storeList storepool.StoreList,
 ) bool {
-	score := o.storeScore(store)
+	score, diskUnhealthyScore := o.storeScore(store)
 	avg := o.storeListAvgScore(storeList)
 
-	if ok, reason := ioOverloadCheck(score, avg,
+	if ok, reason := ioOverloadCheck(score, avg, diskUnhealthyScore,
 		o.LeaseIOOverloadShedThreshold, IOOverloadMeanShedThreshold,
 		o.LeaseEnforcementLevel,
 		IOOverloadThresholdShed,
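
To make the new condition concrete: the disk-unhealthy score acts as a gate layered on the existing IO-overload checks. The standalone Go sketch below is illustrative only — it drops the enforcement-level handling and reason strings, and uses an assumed epsilon constant (the scorer defines its own) — but it shows the same decision shape as ioOverloadCheck.

package main

import "fmt"

// ok mirrors the combined condition in ioOverloadCheck: an action is allowed
// only when the store passes the existing IO-overload checks AND the new
// disk-health gate. Enforcement levels and reason strings are omitted; the
// epsilon here is an assumed small constant, not the scorer's own.
func ok(score, avg, diskUnhealthyScore, absThreshold, meanThreshold float64) bool {
	const epsilon = 1e-10
	absCheck := score < absThreshold
	meanCheck := score < avg*meanThreshold
	diskCheck := diskUnhealthyScore < epsilon || diskUnhealthyScore < absThreshold
	return (absCheck || meanCheck) && diskCheck
}

func main() {
	// Low IO overload, healthy disk (or the score setting left at its default of 0): allowed.
	fmt.Println(ok(0.1, 0.1, 0, 0.2, 1.1)) // true
	// Low IO overload, but an unhealthy disk with a configured score of 0.3
	// (at or above the 0.2 threshold): blocked purely because of disk health.
	fmt.Println(ok(0.1, 0.1, 0.3, 0.2, 1.1)) // false
	// High IO overload is still blocked regardless of disk health.
	fmt.Println(ok(0.5, 0.1, 0, 0.2, 1.1)) // false
}

With kv.allocator.disk_unhealthy_io_overload_score at its default of 0 the gate always passes, which is why the commit is behavior-neutral until an operator raises the score to at least the relevant threshold.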

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go

Lines changed: 42 additions & 0 deletions
@@ -2179,3 +2179,45 @@ func TestCandidateListString(t *testing.T) {
 		"s3, valid:false, fulldisk:false, necessary:false, voterNecessary:false, diversity:1.00, ioOverloaded: false, ioOverload: 1.00, converges:-1, balance:-1, hasNonVoter:false, rangeCount:3, queriesPerSecond:0.00, details:(mock detail 3)]",
 		cl.String())
 }
+
+func TestIOOverloadOptionsDiskUnhealthy(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	ctx := context.Background()
+	options := IOOverloadOptions{
+		UseIOThresholdMax:            false,
+		ReplicaIOOverloadThreshold:   0.2,
+		LeaseIOOverloadThreshold:     0.2,
+		LeaseIOOverloadShedThreshold: 0.2,
+		DiskUnhealthyScore:           0.3,
+	}
+
+	store := roachpb.StoreDescriptor{}
+	o := options
+	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockAll
+	require.True(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockTransfers
+	require.True(t, o.rebalanceReplicaToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.rebalanceReplicaToCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.LeaseEnforcementLevel = IOOverloadThresholdShed
+	require.True(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
+
+	store = roachpb.StoreDescriptor{}
+	o = options
+	o.LeaseEnforcementLevel = IOOverloadThresholdBlockTransfers
+	require.True(t, o.transferLeaseToCheck(ctx, store, storepool.StoreList{}))
+	store.Capacity.IOThreshold.DiskUnhealthy = true
+	require.False(t, o.transferLeaseToCheck(ctx, store, storepool.StoreList{}))
+}
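
The test above exercises a nonzero score. A natural companion case is the default configuration: because DiskUnhealthyScore defaults to 0, a flipping DiskUnhealthy bit should not change any decision. The sketch below is not part of the commit; it assumes the same test file, imports, and helpers as TestIOOverloadOptionsDiskUnhealthy.

// Sketch only, not part of the commit: with
// kv.allocator.disk_unhealthy_io_overload_score left at its default of 0, an
// unhealthy disk must not change any allocator decision.
func TestIOOverloadOptionsDiskUnhealthyDefaultScore(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	o := IOOverloadOptions{
		ReplicaIOOverloadThreshold:   0.2,
		LeaseIOOverloadThreshold:     0.2,
		LeaseIOOverloadShedThreshold: 0.2,
		DiskUnhealthyScore:           0, // the default
	}
	o.ReplicaEnforcementLevel = IOOverloadThresholdBlockAll
	o.LeaseEnforcementLevel = IOOverloadThresholdShed

	store := roachpb.StoreDescriptor{}
	store.Capacity.IOThreshold.DiskUnhealthy = true
	// diskUnhealthyScore stays 0, so the disk gate always passes and the
	// existing checks decide as before.
	require.True(t, o.allocateReplicaToCheck(ctx, store, storepool.StoreList{}))
	require.True(t, o.ExistingLeaseCheck(ctx, store, storepool.StoreList{}))
}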

pkg/kv/kvserver/metrics.go

Lines changed: 13 additions & 5 deletions
@@ -1247,7 +1247,12 @@ var (
 		Measurement: "Events",
 		Unit:        metric.Unit_COUNT,
 	}
-
+	metaDiskUnhealthyDuration = metric.Metadata{
+		Name:        "storage.disk-unhealthy.duration",
+		Help:        "Total disk unhealthy duration in nanos",
+		Measurement: "Nanoseconds",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
 	// Range event metrics.
 	metaRangeSplits = metric.Metadata{
 		Name: "range.splits",
@@ -3111,8 +3116,9 @@ type StoreMetrics struct {
 	RdbCheckpoints *metric.Gauge
 
 	// Disk health metrics.
-	DiskSlow    *metric.Counter
-	DiskStalled *metric.Counter
+	DiskSlow              *metric.Counter
+	DiskStalled           *metric.Counter
+	DiskUnhealthyDuration *metric.Counter
 
 	// TODO(mrtracy): This should be removed as part of #4465. This is only
 	// maintained to keep the current structure of NodeStatus; it would be
@@ -3868,8 +3874,9 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RdbCheckpoints: metric.NewGauge(metaRdbCheckpoints),
 
 		// Disk health metrics.
-		DiskSlow:    metric.NewCounter(metaDiskSlow),
-		DiskStalled: metric.NewCounter(metaDiskStalled),
+		DiskSlow:              metric.NewCounter(metaDiskSlow),
+		DiskStalled:           metric.NewCounter(metaDiskStalled),
+		DiskUnhealthyDuration: metric.NewCounter(metaDiskUnhealthyDuration),
 
 		// Range event metrics.
 		RangeSplits: metric.NewCounter(metaRangeSplits),
@@ -4260,6 +4267,7 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
 	sm.RdbWriteStallNanos.Update(m.WriteStallDuration.Nanoseconds())
 	sm.DiskSlow.Update(m.DiskSlowCount)
 	sm.DiskStalled.Update(m.DiskStallCount)
+	sm.DiskUnhealthyDuration.Update(int64(m.DiskUnhealthyDuration))
 	sm.IterBlockBytes.Update(int64(m.Iterator.BlockBytes))
 	sm.IterBlockBytesInCache.Update(int64(m.Iterator.BlockBytesInCache))
 	sm.IterBlockReadDuration.Update(int64(m.Iterator.BlockReadDuration))

pkg/kv/kvserver/store_gossip.go

Lines changed: 6 additions & 0 deletions
@@ -522,6 +522,9 @@ func (s *StoreGossip) shouldGossipOnCapacityDelta() (should bool, reason string)
 	lastGossipMaxIOScore, _ := s.cachedCapacity.lastGossiped.IOThresholdMax.Score()
 	updateForMaxIOOverloadScore := cachedMaxIOScore >= gossipMinMaxIOOverloadScore &&
 		cachedMaxIOScore > lastGossipMaxIOScore
+	diskUnhealthy := s.cachedCapacity.cached.IOThreshold.DiskUnhealthy
+	updateForChangeInDiskUnhealth :=
+		s.cachedCapacity.lastGossiped.IOThreshold.DiskUnhealthy != diskUnhealthy
 	s.cachedCapacity.Unlock()
 
 	if s.knobs.DisableLeaseCapacityGossip {
@@ -549,6 +552,9 @@ func (s *StoreGossip) shouldGossipOnCapacityDelta() (should bool, reason string)
 	if updateForMaxIOOverloadScore {
 		reason += fmt.Sprintf("io-overload(%.1f) ", cachedMaxIOScore)
 	}
+	if updateForChangeInDiskUnhealth {
+		reason += fmt.Sprintf("disk-unhealthy(%t) ", diskUnhealthy)
+	}
 	if reason != "" {
 		should = true
 		reason += "change"

pkg/roachprod/opentelemetry/cockroachdb_metrics.go

Lines changed: 1 addition & 0 deletions
@@ -1924,6 +1924,7 @@ var cockroachdbMetrics = map[string]string{
 	"storage_disk_read_time": "storage.disk.read.time",
 	"storage_disk_slow": "storage.disk_slow",
 	"storage_disk_stalled": "storage.disk_stalled",
+	"storage_disk_unhealthy_duration": "storage.disk-unhealthy.duration",
 	"storage_disk_weightedio_time": "storage.disk.weightedio.time",
 	"storage_disk_write_bytes": "storage.disk.write.bytes",
 	"storage_disk_write_count": "storage.disk.write.count",

pkg/server/node.go

Lines changed: 2 additions & 0 deletions
@@ -1374,10 +1374,12 @@ func (pmp *nodePebbleMetricsProvider) GetPebbleMetrics() []admission.StoreMetric
 		if s, ok := storeIDToDiskStats[store.StoreID()]; ok {
 			diskStats = s
 		}
+		diskUnhealthy := eng.GetDiskUnhealthy()
 		metrics = append(metrics, admission.StoreMetrics{
 			StoreID:                   store.StoreID(),
 			Metrics:                   m.Metrics,
 			WriteStallCount:           m.WriteStallCount,
+			DiskUnhealthy:             diskUnhealthy,
 			DiskStats:                 diskStats,
 			MemTableSizeForStopWrites: memTableSizeForStopWrites,
 		})

pkg/storage/engine.go

Lines changed: 14 additions & 0 deletions
@@ -1134,6 +1134,17 @@ type Engine interface {
 	// GetPebbleOptions returns the options used when creating the engine. The
 	// caller must not modify these.
 	GetPebbleOptions() *pebble.Options
+
+	// GetDiskUnhealthy returns true if the engine has determined that the
+	// underlying disk is transiently unhealthy. This can change from false to
+	// true and back to false. The engine has mechanisms to mask disk unhealth
+	// (e.g. if WAL failover is configured), but in some cases the unhealth is
+	// longer than what the engine may be able to successfully mask, but not yet
+	// long enough to crash the node (see
+	// COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT). This method returns true in
+	// this intermediate case. Currently, this mainly feeds into allocation
+	// decisions by the caller (such as shedding leases).
+	GetDiskUnhealthy() bool
 }
 
 // Batch is the interface for batch specific operations.
@@ -1250,6 +1261,9 @@ type Metrics struct {
 	// distinguished in the pebble logs.
 	WriteStallCount    int64
 	WriteStallDuration time.Duration
+	// DiskUnhealthyDuration is the duration for which Engine.GetDiskUnhealthy
+	// has returned true.
+	DiskUnhealthyDuration time.Duration
 
 	// BlockLoadConcurrencyLimit is the current limit on the number of concurrent
 	// sstable block reads.
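
The engine-side bookkeeping that drives GetDiskUnhealthy and DiskUnhealthyDuration is not shown in this diff. As a rough, illustrative model only (the type and method names below are assumptions, not CockroachDB code), an engine would need to record transitions of the unhealthy state, triggered by writes that stay outstanding longer than storage.unhealthy_write_duration, and accumulate the time spent unhealthy:

package main

import (
	"fmt"
	"sync"
	"time"
)

// diskHealthTracker is a hypothetical helper, NOT the engine's actual
// implementation: it models the bookkeeping needed to back
// Engine.GetDiskUnhealthy and Metrics.DiskUnhealthyDuration.
type diskHealthTracker struct {
	mu             sync.Mutex
	unhealthy      bool
	unhealthySince time.Time
	total          time.Duration
}

// setUnhealthy records a transition of the disk-health state, e.g. when a
// write exceeds storage.unhealthy_write_duration (unhealthy) or completes
// again in time (healthy).
func (t *diskHealthTracker) setUnhealthy(now time.Time, unhealthy bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if unhealthy == t.unhealthy {
		return
	}
	if unhealthy {
		t.unhealthySince = now
	} else {
		t.total += now.Sub(t.unhealthySince)
	}
	t.unhealthy = unhealthy
}

// getDiskUnhealthy is the shape of answer Engine.GetDiskUnhealthy would give.
func (t *diskHealthTracker) getDiskUnhealthy() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.unhealthy
}

// unhealthyDuration is the cumulative value that would feed
// Metrics.DiskUnhealthyDuration and the storage.disk-unhealthy.duration counter.
func (t *diskHealthTracker) unhealthyDuration(now time.Time) time.Duration {
	t.mu.Lock()
	defer t.mu.Unlock()
	d := t.total
	if t.unhealthy {
		d += now.Sub(t.unhealthySince)
	}
	return d
}

func main() {
	var t diskHealthTracker
	start := time.Now()
	// A write exceeds the unhealthy threshold; the disk recovers 30s later.
	t.setUnhealthy(start, true)
	t.setUnhealthy(start.Add(30*time.Second), false)
	fmt.Println(t.getDiskUnhealthy())                        // false
	fmt.Println(t.unhealthyDuration(start.Add(time.Minute))) // 30s
}

The cumulative, monotonically increasing duration matches the metric's COUNTER type and NON_NEGATIVE_DERIVATIVE aggregation in metrics.yaml: taking its rate gives the fraction of time the disk was unhealthy.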
