Skip to content

Commit 75b1a55

Browse files
committed
failureinjection: add disk stall cycle option
Some usages of disk stalls wish to repeatedly stall and unstall a disk in a short period of time. The current implementation requires an SSH connection per stall/unstall call, which can easily lead to flakes; e.g., the unstall call runs into an SSH flake and the node fatals before it can be recovered. This change adds a disk stall cycle option, which repeatedly stalls and unstalls the disk over a single SSH connection until Recover is called.
1 parent 7ca9db8 commit 75b1a55

File tree

5 files changed

+239
-80
lines changed

5 files changed

+239
-80
lines changed

pkg/cmd/roachtest/roachtestutil/disk_stall.go

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package roachtestutil
77

88
import (
99
"context"
10+
"time"
1011

1112
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1213
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
@@ -22,6 +23,7 @@ type DiskStaller interface {
2223
Setup(ctx context.Context)
2324
Cleanup(ctx context.Context)
2425
Stall(ctx context.Context, nodes option.NodeListOption)
26+
StallCycle(ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration)
2527
Slow(ctx context.Context, nodes option.NodeListOption, bytesPerSecond int)
2628
Unstall(ctx context.Context, nodes option.NodeListOption)
2729
DataDir() string
@@ -32,13 +34,16 @@ type NoopDiskStaller struct{}
3234

3335
var _ DiskStaller = NoopDiskStaller{}
3436

35-
func (n NoopDiskStaller) Cleanup(ctx context.Context) {}
36-
func (n NoopDiskStaller) DataDir() string { return "{store-dir}" }
37-
func (n NoopDiskStaller) LogDir() string { return "logs" }
38-
func (n NoopDiskStaller) Setup(ctx context.Context) {}
39-
func (n NoopDiskStaller) Slow(_ context.Context, _ option.NodeListOption, _ int) {}
40-
func (n NoopDiskStaller) Stall(_ context.Context, _ option.NodeListOption) {}
41-
func (n NoopDiskStaller) Unstall(_ context.Context, _ option.NodeListOption) {}
37+
func (n NoopDiskStaller) Cleanup(ctx context.Context) {}
38+
func (n NoopDiskStaller) DataDir() string { return "{store-dir}" }
39+
func (n NoopDiskStaller) LogDir() string { return "logs" }
40+
func (n NoopDiskStaller) Setup(ctx context.Context) {}
41+
func (n NoopDiskStaller) Slow(_ context.Context, _ option.NodeListOption, _ int) {}
42+
func (n NoopDiskStaller) Stall(_ context.Context, _ option.NodeListOption) {}
43+
func (n NoopDiskStaller) StallCycle(
44+
_ context.Context, _ option.NodeListOption, _, _ time.Duration,
45+
) {}
46+
func (n NoopDiskStaller) Unstall(_ context.Context, _ option.NodeListOption) {}
4247

4348
type Fataler interface {
4449
Fatal(args ...interface{})
@@ -103,6 +108,23 @@ func (s *cgroupDiskStaller) Stall(ctx context.Context, nodes option.NodeListOpti
103108
}
104109
}
105110

111+
func (s *cgroupDiskStaller) StallCycle(
112+
ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration,
113+
) {
114+
l := newDiskStallLogger(s.f.L(), nodes, "Stall")
115+
if err := s.Failer.Inject(ctx, l, failures.DiskStallArgs{
116+
StallLogs: s.stallLogs,
117+
StallWrites: true,
118+
StallReads: s.stallReads,
119+
Nodes: nodes.InstallNodes(),
120+
Cycle: true,
121+
CycleStallDuration: stallDuration,
122+
CycleUnstallDuration: unstallDuration,
123+
}); err != nil {
124+
s.f.Fatalf("failed to stall disk: %s", err)
125+
}
126+
}
127+
106128
func (s *cgroupDiskStaller) Slow(
107129
ctx context.Context, nodes option.NodeListOption, bytesPerSecond int,
108130
) {
@@ -168,6 +190,20 @@ func (s *dmsetupDiskStaller) Stall(ctx context.Context, nodes option.NodeListOpt
168190
}
169191
}
170192

193+
func (s *dmsetupDiskStaller) StallCycle(
194+
ctx context.Context, nodes option.NodeListOption, stallDuration, unstallDuration time.Duration,
195+
) {
196+
l := newDiskStallLogger(s.f.L(), nodes, "Stall")
197+
if err := s.Failer.Inject(ctx, l, failures.DiskStallArgs{
198+
Nodes: nodes.InstallNodes(),
199+
Cycle: true,
200+
CycleStallDuration: stallDuration,
201+
CycleUnstallDuration: unstallDuration,
202+
}); err != nil {
203+
s.f.Fatalf("failed to stall disk: %s", err)
204+
}
205+
}
206+
171207
func (s *dmsetupDiskStaller) Slow(
172208
ctx context.Context, nodes option.NodeListOption, bytesPerSecond int,
173209
) {

pkg/cmd/roachtest/tests/disk_stall.go

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,11 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
462462
// Use CgroupDiskStaller with readsToo=false to only stall writes.
463463
s := roachtestutil.MakeCgroupDiskStaller(t, c, false /* readsToo */, false /* logsToo */)
464464
s.Setup(ctx)
465-
defer s.Cleanup(ctx)
465+
// NB: We use a background context in the defer'ed cleanup command,
466+
// otherwise on test failure our c.Run calls will be ignored. Leaving
467+
// the disk stalled will prevent artifact collection, making debugging
468+
// difficult.
469+
defer s.Cleanup(context.Background())
466470

467471
t.Status("starting cluster")
468472
startOpts := option.DefaultStartOpts()
@@ -612,36 +616,15 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
612616
case <-time.After(diskStallWaitDur):
613617
t.Status("starting disk stall")
614618
}
615-
stallStart := timeutil.Now()
616-
// Execute short 200ms stalls every 5s.
617-
for timeutil.Since(stallStart) < operationDur {
618-
select {
619-
case <-ctx.Done():
620-
t.Fatalf("context done while stall induced: %s", ctx.Err())
621-
case <-time.After(stallInterval):
622-
func() {
623-
t.Status("short disk stall on n1")
624-
s.Stall(ctx, c.Node(1))
625-
defer func() {
626-
// NB: We use a background context in the defer'ed unstall command,
627-
// otherwise on test failure our Unstall calls will be ignored. Leaving
628-
// the disk stalled will prevent artifact collection, making debugging
629-
// difficult.
630-
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
631-
defer cancel()
632-
s.Unstall(ctx, c.Node(1))
633-
t.Status("unstalled disk on n1")
634-
}()
635-
select {
636-
case <-ctx.Done():
637-
t.Fatalf("context done while stall induced: %s", ctx.Err())
638-
case <-time.After(shortStallDur):
639-
return
640-
}
641-
}()
642-
}
619+
// Execute short 200ms stalls every 5s for 3 minutes.
620+
s.StallCycle(ctx, c.Node(1), shortStallDur, stallInterval)
621+
select {
622+
case <-ctx.Done():
623+
t.Fatalf("context done while stall induced: %s", ctx.Err())
624+
case <-time.After(operationDur):
625+
s.Unstall(ctx, c.Node(1))
626+
t.Status("disk stalls stopped")
643627
}
644-
645628
return nil
646629
}, task.Name("disk-stall-phase"))
647630
} else {

0 commit comments

Comments
 (0)