Skip to content

Commit 4187d41

Browse files
committed
roachtestutil: disk stall Unstall returns error
Depending on the context of the test, Unstall erroring out should result in a test failure. Some tests purposely induce a node panic, in which case an error is expected and can be safely ignored. However, some tests want to test that WAL Failover works and any node death is unexpected. This change modifies the disk stall API to return an error for Unstall, and lets the test writer decide how to handle it.
1 parent 75b1a55 commit 4187d41

File tree

5 files changed

+39
-21
lines changed

5 files changed

+39
-21
lines changed

pkg/cmd/roachtest/operations/disk_stall.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ type cleanupDiskStall struct {
2424
}
2525

2626
func (cl *cleanupDiskStall) Cleanup(ctx context.Context, o operation.Operation, c cluster.Cluster) {
27-
cl.staller.Unstall(ctx, cl.nodes)
27+
if err := cl.staller.Unstall(ctx, cl.nodes); err != nil {
28+
o.Fatalf("failed to unstall disk: %v", err)
29+
}
2830
o.Status("unstalled nodes; waiting 10 seconds before restarting")
2931
time.Sleep(10 * time.Second)
3032
// We might need to restart the node if it isn't live.

pkg/cmd/roachtest/roachtestutil/disk_stall.go

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
1515
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
1616
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
17+
"github.com/cockroachdb/errors"
1718
)
1819

1920
// TODO(darryl): Once the failure injection library is a first class citizen of roachtest,
@@ -25,7 +26,7 @@ type DiskStaller interface {
2526
Stall(ctx context.Context, nodes option.NodeListOption)
2627
StallCycle(ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration)
2728
Slow(ctx context.Context, nodes option.NodeListOption, bytesPerSecond int)
28-
Unstall(ctx context.Context, nodes option.NodeListOption)
29+
Unstall(ctx context.Context, nodes option.NodeListOption) error
2930
DataDir() string
3031
LogDir() string
3132
}
@@ -34,16 +35,17 @@ type NoopDiskStaller struct{}
3435

3536
var _ DiskStaller = NoopDiskStaller{}
3637

37-
func (n NoopDiskStaller) Cleanup(ctx context.Context) {}
38-
func (n NoopDiskStaller) DataDir() string { return "{store-dir}" }
39-
func (n NoopDiskStaller) LogDir() string { return "logs" }
40-
func (n NoopDiskStaller) Setup(ctx context.Context) {}
41-
func (n NoopDiskStaller) Slow(_ context.Context, _ option.NodeListOption, _ int) {}
42-
func (n NoopDiskStaller) Stall(_ context.Context, _ option.NodeListOption) {}
38+
func (n NoopDiskStaller) Cleanup(ctx context.Context) {}
39+
func (n NoopDiskStaller) DataDir() string { return "{store-dir}" }
40+
func (n NoopDiskStaller) LogDir() string { return "logs" }
41+
func (n NoopDiskStaller) Setup(ctx context.Context) {}
42+
func (n NoopDiskStaller) Slow(_ context.Context, _ option.NodeListOption, _ int) {}
43+
func (n NoopDiskStaller) Stall(_ context.Context, _ option.NodeListOption) {}
4344
func (n NoopDiskStaller) StallCycle(
4445
_ context.Context, _ option.NodeListOption, _, _ time.Duration,
45-
) {}
46-
func (n NoopDiskStaller) Unstall(_ context.Context, _ option.NodeListOption) {}
46+
) {
47+
}
48+
func (n NoopDiskStaller) Unstall(_ context.Context, _ option.NodeListOption) error { return nil }
4749

4850
type Fataler interface {
4951
Fatal(args ...interface{})
@@ -140,11 +142,12 @@ func (s *cgroupDiskStaller) Slow(
140142
}
141143
}
142144

143-
func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
145+
func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) error {
144146
l := newDiskStallLogger(s.f.L(), nodes, "Unstall")
145-
if err := s.Failer.Recover(ctx, l); err != nil {
146-
s.f.Fatalf("failed to unstall disk: %s", err)
147-
}
147+
// cgroup may fail when unstalling the disk, usually because the node already
148+
// fataled and the cgroup is no longer available. Return the error and let
149+
// the caller decide if a node fatal is expected or not.
150+
return errors.Wrap(s.Failer.Recover(ctx, l), "failed to unstall disk")
148151
}
149152

150153
type dmsetupDiskStaller struct {
@@ -211,11 +214,13 @@ func (s *dmsetupDiskStaller) Slow(
211214
s.f.Fatal("Slow is not supported for dmsetupDiskStaller")
212215
}
213216

214-
func (s *dmsetupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
217+
func (s *dmsetupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) error {
215218
l := newDiskStallLogger(s.f.L(), nodes, "Unstall")
219+
// Any unstall error for dmsetup is unexpected and should fail the test.
216220
if err := s.Failer.Recover(ctx, l); err != nil {
217221
s.f.Fatalf("failed to unstall disk: %s", err)
218222
}
223+
return nil
219224
}
220225

221226
func (s *dmsetupDiskStaller) DataDir() string { return "{store-dir}" }

pkg/cmd/roachtest/tests/disk_stall.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,9 @@ func runDiskStalledWALFailover(ctx context.Context, t test.Test, c cluster.Clust
129129
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
130130
defer cancel()
131131
t.Status("Unstalling disk on n1")
132-
s.Unstall(ctx, c.Node(1))
132+
if err = s.Unstall(ctx, c.Node(1)); err != nil {
133+
t.Fatal(err)
134+
}
133135
t.Status("Unstalled disk on n1")
134136
}()
135137

@@ -378,7 +380,11 @@ func runDiskStalledDetection(
378380
}
379381

380382
// Unstall the stalled node. It should be able to be reaped.
381-
s.Unstall(ctx, c.Node(1))
383+
// Note we only log errors since cgroup unstall is expected to fail due to
384+
// nodes panicking from a detected disk stall.
385+
if err = s.Unstall(ctx, c.Node(1)); err != nil {
386+
t.L().Printf("failed to unstall disk: %v", err)
387+
}
382388
time.Sleep(1 * time.Second)
383389
exit, ok = getProcessExitMonotonic(ctx, t, c, 1)
384390
if doStall {
@@ -622,7 +628,9 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
622628
case <-ctx.Done():
623629
t.Fatalf("context done while stall induced: %s", ctx.Err())
624630
case <-time.After(operationDur):
625-
s.Unstall(ctx, c.Node(1))
631+
if err = s.Unstall(ctx, c.Node(1)); err != nil {
632+
t.Fatal(err)
633+
}
626634
t.Status("disk stalls stopped")
627635
}
628636
return nil

pkg/cmd/roachtest/tests/failover.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,7 +1605,6 @@ func (f *diskStallFailer) Setup(ctx context.Context) {
16051605
}
16061606

16071607
func (f *diskStallFailer) Cleanup(ctx context.Context) {
1608-
f.staller.Unstall(ctx, f.c.All())
16091608
// We have to stop the cluster before cleaning up the staller.
16101609
f.m.ExpectDeaths(int32(f.c.Spec().NodeCount))
16111610
f.c.Stop(ctx, f.t.L(), option.DefaultStopOpts(), f.c.All())
@@ -1628,7 +1627,9 @@ func (f *diskStallFailer) Fail(ctx context.Context, nodeID int) {
16281627
}
16291628

16301629
func (f *diskStallFailer) Recover(ctx context.Context, nodeID int) {
1631-
f.staller.Unstall(ctx, f.c.Node(nodeID))
1630+
if err := f.staller.Unstall(ctx, f.c.Node(nodeID)); err != nil {
1631+
f.t.Fatalf("failed to unstall disk %v", err)
1632+
}
16321633
// Pebble's disk stall detector should have terminated the node, but in case
16331634
// it didn't, we explicitly stop it first.
16341635
f.c.Stop(ctx, f.t.L(), option.DefaultStopOpts(), f.c.Node(nodeID))

pkg/cmd/roachtest/tests/perturbation/slow_disk.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ func (s *slowDisk) startPerturbation(ctx context.Context, t test.Test, v variati
8080

8181
// endPerturbation implements perturbation.
8282
func (s *slowDisk) endPerturbation(ctx context.Context, t test.Test, v variations) time.Duration {
83-
s.staller.Unstall(ctx, v.targetNodes())
83+
if err := s.staller.Unstall(ctx, v.targetNodes()); err != nil {
84+
t.Fatal("failed to unstall disk:", err)
85+
}
8486
waitDuration(ctx, v.validationDuration)
8587
return v.validationDuration
8688
}

0 commit comments

Comments
 (0)