
Commit e25ecb8

craig[bot] and DarrylWong committed
Merge #148704
148704: failureinjection: add disk stall cycle option r=xinhaoz a=DarrylWong

Some usages of disk stalls want to repeatedly stall and unstall a disk within a short period of time. The current implementation requires an ssh connection per stall/unstall call, which can easily lead to flakes, e.g. the unstall call hits an ssh flake and the node fatals before it can be recovered. This change adds a disk stall cycle option that runs the whole stall/unstall loop over a single ssh connection until Recover is called.

Fixes: #148345

Release note: none

Co-authored-by: DarrylWong <[email protected]>
2 parents 5069471 + 4187d41 commit e25ecb8
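
Before the per-file diffs, here is a minimal sketch of how a test might drive the new API, using only the signatures introduced in this change. demoStallCycle is a hypothetical name, and the test.Test/cluster.Cluster scaffolding is assumed from roachtest; this is an illustration, not code from the commit.

// demoStallCycle is a hypothetical example, not part of this commit.
func demoStallCycle(ctx context.Context, t test.Test, c cluster.Cluster) {
	s := roachtestutil.MakeCgroupDiskStaller(t, c, false /* readsToo */, false /* logsToo */)
	s.Setup(ctx)
	// Use a background context so cleanup still runs on test failure.
	defer s.Cleanup(context.Background())

	// One call drives the whole cycle over a single ssh connection:
	// stall for 200ms, unstall for 5s, repeat until Recover is called.
	s.StallCycle(ctx, c.Node(1), 200*time.Millisecond, 5*time.Second)

	// ... run the workload under the cycling stall ...

	// Unstall now returns an error instead of fataling, so the caller decides
	// whether a node death is expected.
	if err := s.Unstall(ctx, c.Node(1)); err != nil {
		t.Fatal(err)
	}
}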

File tree: 8 files changed, +262 −84 lines


pkg/cmd/roachtest/operations/disk_stall.go

Lines changed: 3 additions & 1 deletion

@@ -24,7 +24,9 @@ type cleanupDiskStall struct {
 }
 
 func (cl *cleanupDiskStall) Cleanup(ctx context.Context, o operation.Operation, c cluster.Cluster) {
-	cl.staller.Unstall(ctx, cl.nodes)
+	if err := cl.staller.Unstall(ctx, cl.nodes); err != nil {
+		o.Fatalf("failed to unstall disk: %v", err)
+	}
 	o.Status("unstalled nodes; waiting 10 seconds before restarting")
 	time.Sleep(10 * time.Second)
 	// We might need to restart the node if it isn't live.

pkg/cmd/roachtest/roachtestutil/disk_stall.go

Lines changed: 48 additions & 7 deletions

@@ -7,12 +7,14 @@ package roachtestutil
 
 import (
 	"context"
+	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
+	"github.com/cockroachdb/errors"
 )
 
 // TODO(darryl): Once the failure injection library is a first class citizen of roachtest,
@@ -22,8 +24,9 @@ type DiskStaller interface {
 	Setup(ctx context.Context)
 	Cleanup(ctx context.Context)
 	Stall(ctx context.Context, nodes option.NodeListOption)
+	StallCycle(ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration)
 	Slow(ctx context.Context, nodes option.NodeListOption, bytesPerSecond int)
-	Unstall(ctx context.Context, nodes option.NodeListOption)
+	Unstall(ctx context.Context, nodes option.NodeListOption) error
 	DataDir() string
 	LogDir() string
 }
@@ -38,7 +41,11 @@ func (n NoopDiskStaller) LogDir() string
 func (n NoopDiskStaller) Setup(ctx context.Context) {}
 func (n NoopDiskStaller) Slow(_ context.Context, _ option.NodeListOption, _ int) {}
 func (n NoopDiskStaller) Stall(_ context.Context, _ option.NodeListOption) {}
-func (n NoopDiskStaller) Unstall(_ context.Context, _ option.NodeListOption) {}
+func (n NoopDiskStaller) StallCycle(
+	_ context.Context, _ option.NodeListOption, _, _ time.Duration,
+) {
+}
+func (n NoopDiskStaller) Unstall(_ context.Context, _ option.NodeListOption) error { return nil }
 
 type Fataler interface {
 	Fatal(args ...interface{})
@@ -103,6 +110,23 @@ func (s *cgroupDiskStaller) Stall(ctx context.Context, nodes option.NodeListOpti
 	}
 }
 
+func (s *cgroupDiskStaller) StallCycle(
+	ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration,
+) {
+	l := newDiskStallLogger(s.f.L(), nodes, "Stall")
+	if err := s.Failer.Inject(ctx, l, failures.DiskStallArgs{
+		StallLogs:            s.stallLogs,
+		StallWrites:          true,
+		StallReads:           s.stallReads,
+		Nodes:                nodes.InstallNodes(),
+		Cycle:                true,
+		CycleStallDuration:   stallDuration,
+		CycleUnstallDuration: unstallDuration,
+	}); err != nil {
+		s.f.Fatalf("failed to stall disk: %s", err)
+	}
+}
+
 func (s *cgroupDiskStaller) Slow(
 	ctx context.Context, nodes option.NodeListOption, bytesPerSecond int,
 ) {
@@ -118,11 +142,12 @@ func (s *cgroupDiskStaller) Slow(
 	}
 }
 
-func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
+func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) error {
 	l := newDiskStallLogger(s.f.L(), nodes, "Unstall")
-	if err := s.Failer.Recover(ctx, l); err != nil {
-		s.f.Fatalf("failed to unstall disk: %s", err)
-	}
+	// cgroup may fail when unstalling the disk, usually because the node already
+	// fataled and the cgroup is no longer available. Return the error and let
+	// the caller decide if a node fatal is expected or not.
+	return errors.Wrap(s.Failer.Recover(ctx, l), "failed to unstall disk")
 }
 
 type dmsetupDiskStaller struct {
@@ -168,18 +193,34 @@ func (s *dmsetupDiskStaller) Stall(ctx context.Context, nodes option.NodeListOpt
 	}
 }
 
+func (s *dmsetupDiskStaller) StallCycle(
+	ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration,
+) {
+	l := newDiskStallLogger(s.f.L(), nodes, "Stall")
+	if err := s.Failer.Inject(ctx, l, failures.DiskStallArgs{
+		Nodes:                nodes.InstallNodes(),
+		Cycle:                true,
+		CycleStallDuration:   stallDuration,
+		CycleUnstallDuration: unstallDuration,
+	}); err != nil {
+		s.f.Fatalf("failed to stall disk: %s", err)
+	}
+}
+
 func (s *dmsetupDiskStaller) Slow(
 	ctx context.Context, nodes option.NodeListOption, bytesPerSecond int,
 ) {
 	// TODO(baptist): Consider https://github.com/kawamuray/ddi.
 	s.f.Fatal("Slow is not supported for dmsetupDiskStaller")
 }
 
-func (s *dmsetupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
+func (s *dmsetupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) error {
 	l := newDiskStallLogger(s.f.L(), nodes, "Unstall")
+	// Any unstall error for dmsetup is unexpected and should fail the test.
 	if err := s.Failer.Recover(ctx, l); err != nil {
 		s.f.Fatalf("failed to unstall disk: %s", err)
 	}
+	return nil
 }
 
 func (s *dmsetupDiskStaller) DataDir() string { return "{store-dir}" }
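
The Failer that consumes these DiskStallArgs lives in pkg/roachprod/failureinjection/failures and is not shown in this diff. The commit's key idea, though, is that the cycle runs as one long-lived operation rather than one ssh round trip per toggle. A hypothetical sketch of that loop shape (illustrative only; the actual failures-package implementation may differ):

// runStallCycle is a hypothetical sketch: toggle stall/unstall on fixed
// intervals until the context is canceled (standing in here for Recover).
func runStallCycle(
	ctx context.Context, stall, unstall func() error, stallDur, unstallDur time.Duration,
) error {
	for {
		if err := stall(); err != nil {
			return err
		}
		select {
		case <-ctx.Done():
			// Leave the disk unstalled on the way out.
			return unstall()
		case <-time.After(stallDur):
		}
		if err := unstall(); err != nil {
			return err
		}
		select {
		case <-ctx.Done():
			return nil
		case <-time.After(unstallDur):
		}
	}
}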

pkg/cmd/roachtest/tests/disk_stall.go

Lines changed: 22 additions & 31 deletions

@@ -129,7 +129,9 @@ func runDiskStalledWALFailover(ctx context.Context, t test.Test, c cluster.Clust
 		ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
 		defer cancel()
 		t.Status("Unstalling disk on n1")
-		s.Unstall(ctx, c.Node(1))
+		if err = s.Unstall(ctx, c.Node(1)); err != nil {
+			t.Fatal(err)
+		}
 		t.Status("Unstalled disk on n1")
 	}()
 
@@ -378,7 +380,11 @@ func runDiskStalledDetection(
 	}
 
 	// Unstall the stalled node. It should be able to be reaped.
-	s.Unstall(ctx, c.Node(1))
+	// Note we only log errors since cgroup unstall is expected to fail due to
+	// nodes panicking from a detected disk stall.
+	if err = s.Unstall(ctx, c.Node(1)); err != nil {
+		t.L().Printf("failed to unstall disk: %v", err)
+	}
 	time.Sleep(1 * time.Second)
 	exit, ok = getProcessExitMonotonic(ctx, t, c, 1)
 	if doStall {
@@ -462,7 +468,11 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
 	// Use CgroupDiskStaller with readsToo=false to only stall writes.
 	s := roachtestutil.MakeCgroupDiskStaller(t, c, false /* readsToo */, false /* logsToo */)
 	s.Setup(ctx)
-	defer s.Cleanup(ctx)
+	// NB: We use a background context in the defer'ed cleanup command,
+	// otherwise on test failure our c.Run calls will be ignored. Leaving
+	// the disk stalled will prevent artifact collection, making debugging
+	// difficult.
+	defer s.Cleanup(context.Background())
 
 	t.Status("starting cluster")
 	startOpts := option.DefaultStartOpts()
@@ -612,36 +622,17 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
 			case <-time.After(diskStallWaitDur):
 				t.Status("starting disk stall")
 			}
-			stallStart := timeutil.Now()
-			// Execute short 200ms stalls every 5s.
-			for timeutil.Since(stallStart) < operationDur {
-				select {
-				case <-ctx.Done():
-					t.Fatalf("context done while stall induced: %s", ctx.Err())
-				case <-time.After(stallInterval):
-					func() {
-						t.Status("short disk stall on n1")
-						s.Stall(ctx, c.Node(1))
-						defer func() {
-							// NB: We use a background context in the defer'ed unstall command,
-							// otherwise on test failure our Unstall calls will be ignored. Leaving
-							// the disk stalled will prevent artifact collection, making debugging
-							// difficult.
-							ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
-							defer cancel()
-							s.Unstall(ctx, c.Node(1))
-							t.Status("unstalled disk on n1")
-						}()
-						select {
-						case <-ctx.Done():
-							t.Fatalf("context done while stall induced: %s", ctx.Err())
-						case <-time.After(shortStallDur):
-							return
-						}
-					}()
+			// Execute short 200ms stalls every 5s for 3 minutes.
+			s.StallCycle(ctx, c.Node(1), shortStallDur, stallInterval)
+			select {
+			case <-ctx.Done():
+				t.Fatalf("context done while stall induced: %s", ctx.Err())
+			case <-time.After(operationDur):
+				if err = s.Unstall(ctx, c.Node(1)); err != nil {
+					t.Fatal(err)
 				}
+				t.Status("disk stalls stopped")
 			}
-
 			return nil
 		}, task.Name("disk-stall-phase"))
 	} else {

pkg/cmd/roachtest/tests/failover.go

Lines changed: 3 additions & 1 deletion

@@ -1635,7 +1635,9 @@ func (f *diskStallFailer) Fail(ctx context.Context, nodeID int) {
 }
 
 func (f *diskStallFailer) Recover(ctx context.Context, nodeID int) {
-	f.staller.Unstall(ctx, f.c.Node(nodeID))
+	if err := f.staller.Unstall(ctx, f.c.Node(nodeID)); err != nil {
+		f.t.Fatalf("failed to unstall disk %v", err)
+	}
 	// Pebble's disk stall detector should have terminated the node, but in case
 	// it didn't, we explicitly stop it first.
 	f.c.Stop(ctx, f.t.L(), option.DefaultStopOpts(), f.c.Node(nodeID))

pkg/cmd/roachtest/tests/perturbation/slow_disk.go

Lines changed: 3 additions & 1 deletion

@@ -81,7 +81,9 @@ func (s *slowDisk) startPerturbation(ctx context.Context, t test.Test, v variati
 
 // endPerturbation implements perturbation.
 func (s *slowDisk) endPerturbation(ctx context.Context, t test.Test, v variations) time.Duration {
-	s.staller.Unstall(ctx, v.targetNodes())
+	if err := s.staller.Unstall(ctx, v.targetNodes()); err != nil {
+		t.Fatal("failed to unstall disk:", err)
+	}
 	waitDuration(ctx, v.validationDuration)
 	return v.validationDuration
 }
