
Commit f26d46c

craig[bot] and erikgrinaker committed
106893: failover: re-enable disk stall detector in `diskStallFailer.Ready` r=erikgrinaker a=erikgrinaker

`pauseFailer` needs to disable the disk stall detector to avoid false positives. However, it attempted to re-enable it via cluster setting during recovery. If a system range is unavailable during recovery (typically in chaos tests with concurrent failures), this can error out. This patch instead (re-)enables the disk stall detector during `diskStallFailer.Ready`.

Touches cockroachdb#106681.
Touches cockroachdb#106752.

Epic: none

Release note: None

Co-authored-by: Erik Grinaker <[email protected]>
2 parents bfa276d + cd0cc8c

File tree

1 file changed: +13 -9 lines


pkg/cmd/roachtest/tests/failover.go

Lines changed: 13 additions & 9 deletions
@@ -1533,7 +1533,6 @@ func (f *diskStallFailer) Mode() failureMode { return failureModeDiskS
 func (f *diskStallFailer) String() string { return string(f.Mode()) }
 func (f *diskStallFailer) CanUseLocal() bool { return false } // needs dmsetup
 func (f *diskStallFailer) CanRunWith(failureMode) bool { return true }
-func (f *diskStallFailer) Ready(context.Context, int) {}
 
 func (f *diskStallFailer) Setup(ctx context.Context) {
 	f.staller.Setup(ctx)
@@ -1547,6 +1546,15 @@ func (f *diskStallFailer) Cleanup(ctx context.Context) {
 	f.staller.Cleanup(ctx)
 }
 
+func (f *diskStallFailer) Ready(ctx context.Context, nodeID int) {
+	// Other failure modes may have disabled the disk stall detector (see
+	// pauseFailer), so we explicitly enable it.
+	conn := f.c.Conn(ctx, f.t.L(), nodeID)
+	_, err := conn.ExecContext(ctx,
+		`SET CLUSTER SETTING storage.max_sync_duration.fatal.enabled = true`)
+	require.NoError(f.t, err)
+}
+
 func (f *diskStallFailer) Fail(ctx context.Context, nodeID int) {
 	// Pebble's disk stall detector should crash the node.
 	f.m.ExpectDeath()
@@ -1583,7 +1591,7 @@ func (f *pauseFailer) CanRunWith(other failureMode) bool {
 func (f *pauseFailer) Ready(ctx context.Context, nodeID int) {
 	// The process pause can trip the disk stall detector, so we disable it. We
 	// could let it fire, but we'd like to see if the node can recover from the
-	// pause and keep working.
+	// pause and keep working. It will be re-enabled by diskStallFailer.Ready().
 	conn := f.c.Conn(ctx, f.t.L(), nodeID)
 	_, err := conn.ExecContext(ctx,
 		`SET CLUSTER SETTING storage.max_sync_duration.fatal.enabled = false`)
@@ -1596,13 +1604,9 @@ func (f *pauseFailer) Fail(ctx context.Context, nodeID int) {
 
 func (f *pauseFailer) Recover(ctx context.Context, nodeID int) {
 	f.c.Signal(ctx, f.t.L(), 18, f.c.Node(nodeID)) // SIGCONT
-
-	// Re-enable disk stall detector, in case we do a disk stall failure after
-	// this (e.g. in chaos tests).
-	conn := f.c.Conn(ctx, f.t.L(), 1)
-	_, err := conn.ExecContext(ctx,
-		`SET CLUSTER SETTING storage.max_sync_duration.fatal.enabled = true`)
-	require.NoError(f.t, err)
+	// NB: We don't re-enable the disk stall detector here, but instead rely on
+	// diskStallFailer.Ready to ensure it's enabled, since the cluster likely
+	// hasn't recovered yet and we may fail to set the cluster setting.
 }
 
 // waitForUpreplication waits for upreplication of ranges that satisfy the
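
For readers outside the roachtest framework, here is a minimal standalone sketch of the cluster-setting toggle that both failers issue. It is not code from the CockroachDB repository: the package name, the helper name setDiskStallDetector, and the use of a plain *sql.DB handle are assumptions made for illustration; only the SQL statement comes from the diff above.

package failoversketch

import (
	"context"
	"database/sql"
	"fmt"
)

// setDiskStallDetector flips storage.max_sync_duration.fatal.enabled, the
// setting that pauseFailer.Ready turns off (false) and diskStallFailer.Ready
// now turns back on (true). Toggling it in Ready, before a disk stall is
// injected, avoids issuing the statement while the cluster may still be
// recovering from an earlier failure.
func setDiskStallDetector(ctx context.Context, db *sql.DB, enabled bool) error {
	// Cluster settings take a literal value; format the bool directly instead
	// of relying on placeholder support for SET CLUSTER SETTING.
	stmt := fmt.Sprintf(
		`SET CLUSTER SETTING storage.max_sync_duration.fatal.enabled = %t`, enabled)
	if _, err := db.ExecContext(ctx, stmt); err != nil {
		return fmt.Errorf("toggling disk stall detector: %w", err)
	}
	return nil
}

A caller would open a SQL connection to any live node and invoke setDiskStallDetector(ctx, db, true) immediately before injecting a disk stall, mirroring what diskStallFailer.Ready does in the diff.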
