
Commit 3091388

roachtest: use failure injection library for disk stalls
This change switches the roachtestutil disk staller to the failureinjection library. We keep the roachtestutil DiskStaller interface for now, since it still provides some additional cluster spec validation, but it will be replaced entirely in the future.
1 parent 2d01e6d
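
For readers unfamiliar with the failureinjection library, the lifecycle the wrappers below now delegate to is Setup, then Inject, then Recover, then Cleanup, driven with failures.DiskStallArgs. The following is a minimal sketch of that call pattern, pieced together from the diffs in this commit; the `t`, `c`, and `ctx` variables are assumed to come from the usual roachtest harness and are not part of this change:

    // Grab a Failer for the cgroups-based disk stall failure mode, scoped to
    // the CRDB nodes of the cluster.
    failer, err := c.GetFailer(t.L(), c.CRDBNodes(), failures.CgroupsDiskStallName)
    if err != nil {
        t.Fatal(err)
    }
    args := failures.DiskStallArgs{
        StallWrites: true,
        StallLogs:   true,
        Nodes:       c.CRDBNodes().InstallNodes(),
    }
    // Setup stages the failure mode, Inject starts the stall, Recover undoes
    // it, and Cleanup tears the staging down again.
    if err := failer.Setup(ctx, t.L(), args); err != nil {
        t.Fatal(err)
    }
    if err := failer.Inject(ctx, t.L(), args); err != nil {
        t.Fatal(err)
    }
    // ... run the workload while the disk is stalled ...
    if err := failer.Recover(ctx, t.L()); err != nil {
        t.Fatal(err)
    }
    if err := failer.Cleanup(ctx, t.L()); err != nil {
        t.Fatal(err)
    }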

19 files changed: +191 additions, -203 deletions

pkg/cmd/roachtest/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ go_library(
         "//pkg/roachprod/cloud",
         "//pkg/roachprod/config",
         "//pkg/roachprod/errors",
+        "//pkg/roachprod/failureinjection/failures",
         "//pkg/roachprod/install",
         "//pkg/roachprod/logger",
         "//pkg/roachprod/prometheus",

pkg/cmd/roachtest/cluster.go

Lines changed: 18 additions & 0 deletions
@@ -42,6 +42,7 @@ import (
     "github.com/cockroachdb/cockroach/pkg/roachprod"
     "github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
     "github.com/cockroachdb/cockroach/pkg/roachprod/config"
+    "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
     "github.com/cockroachdb/cockroach/pkg/roachprod/install"
     "github.com/cockroachdb/cockroach/pkg/roachprod/logger"
     "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
@@ -3311,3 +3312,20 @@ func (c *clusterImpl) RegisterClusterHook(
         panic(fmt.Sprintf("unknown test hook type %v", hookType))
     }
 }
+
+// GetFailer returns a *failures.Failer for the given failure mode name. Used
+// for conducting failure injection on a cluster.
+func (c *clusterImpl) GetFailer(
+    l *logger.Logger,
+    nodes option.NodeListOption,
+    failureModeName string,
+    opts ...failures.ClusterOptionFunc,
+) (*failures.Failer, error) {
+    fr := failures.GetFailureRegistry()
+    clusterOpts := append(opts, failures.Secure(c.IsSecure()))
+    failer, err := fr.GetFailer(c.MakeNodes(nodes), failureModeName, l, clusterOpts...)
+    if err != nil {
+        return nil, err
+    }
+    return failer, err
+}
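
As a usage sketch (the test-side plumbing here is assumed, not part of this diff), a test obtains a failer through this method rather than through the registry directly, so the cluster name and secure-mode setting are threaded through automatically:

    failer, err := c.GetFailer(t.L(), c.CRDBNodes(), failures.DmsetupDiskStallName)
    if err != nil {
        t.Fatal(err)
    }
    defer func() {
        // Best-effort teardown of any staging done by the failure mode.
        _ = failer.Cleanup(ctx, t.L())
    }()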

pkg/cmd/roachtest/cluster/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ go_library(
         "//pkg/cmd/roachtest/option",
         "//pkg/cmd/roachtest/spec",
         "//pkg/roachprod",
+        "//pkg/roachprod/failureinjection/failures",
         "//pkg/roachprod/install",
         "//pkg/roachprod/logger",
         "//pkg/roachprod/prometheus",

pkg/cmd/roachtest/cluster/cluster_interface.go

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,7 @@ import (
     "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
     "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
     "github.com/cockroachdb/cockroach/pkg/roachprod"
+    "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
     "github.com/cockroachdb/cockroach/pkg/roachprod/install"
     "github.com/cockroachdb/cockroach/pkg/roachprod/logger"
     "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
@@ -215,4 +216,6 @@ type Cluster interface {
     GetPreemptedVMs(ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error)
 
     RegisterClusterHook(hookName string, hookType option.ClusterHookType, timeout time.Duration, hook func(context.Context) error)
+
+    GetFailer(l *logger.Logger, nodes option.NodeListOption, failureModeName string, opts ...failures.ClusterOptionFunc) (*failures.Failer, error)
 }

pkg/cmd/roachtest/clusterstats/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ go_library(
         # Required for generated mocks
         "//pkg/roachprod", #keep
         # Required for generated mocks
+        "//pkg/roachprod/failureinjection/failures", #keep
         "//pkg/roachprod/install", #keep
         "//pkg/roachprod/logger",
         "//pkg/roachprod/prometheus",

pkg/cmd/roachtest/clusterstats/mock_cluster_generated_test.go

Lines changed: 21 additions & 0 deletions
Generated file; diff not rendered by default.

pkg/cmd/roachtest/roachtestutil/disk_stall.go

Lines changed: 65 additions & 150 deletions
@@ -7,18 +7,17 @@ package roachtestutil
 
 import (
     "context"
-    "fmt"
-    "math/rand"
-    "path/filepath"
-    "strconv"
-    "strings"
 
     "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
     "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
     "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
+    "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
     "github.com/cockroachdb/cockroach/pkg/roachprod/logger"
 )
 
+// TODO(darryl): Once the failure injection library is a first class citizen of roachtest,
+// i.e. synced with the monitor, test + cluster spec validation, observability into failure
+// modes, etc. we can remove this interface entirely.
 type DiskStaller interface {
     Setup(ctx context.Context)
     Cleanup(ctx context.Context)
@@ -48,20 +47,23 @@ type Fataler interface {
 }
 
 type cgroupDiskStaller struct {
-    f           Fataler
-    c           cluster.Cluster
-    readOrWrite []bandwidthReadWrite
-    logsToo     bool
+    *failures.Failer
+    f          Fataler
+    c          cluster.Cluster
+    stallReads bool
+    stallLogs  bool
 }
 
 var _ DiskStaller = (*cgroupDiskStaller)(nil)
 
-func MakeCgroupDiskStaller(f Fataler, c cluster.Cluster, readsToo bool, logsToo bool) DiskStaller {
-    bwRW := []bandwidthReadWrite{writeBandwidth}
-    if readsToo {
-        bwRW = append(bwRW, readBandwidth)
+func MakeCgroupDiskStaller(
+    f Fataler, c cluster.Cluster, stallReads bool, stallLogs bool,
+) DiskStaller {
+    diskStaller, err := c.GetFailer(f.L(), c.CRDBNodes(), failures.CgroupsDiskStallName)
+    if err != nil {
+        f.Fatalf("failed to get failer: %s", err)
     }
-    return &cgroupDiskStaller{f: f, c: c, readOrWrite: bwRW, logsToo: logsToo}
+    return &cgroupDiskStaller{Failer: diskStaller, f: f, c: c, stallReads: stallReads, stallLogs: stallLogs}
 }
 
 func (s *cgroupDiskStaller) DataDir() string { return "{store-dir}" }
@@ -73,174 +75,89 @@ func (s *cgroupDiskStaller) Setup(ctx context.Context) {
         // Safety measure.
         s.f.Fatalf("cluster needs ReusePolicyNone to support disk stalls")
     }
-    if s.logsToo {
-        s.c.Run(ctx, option.WithNodes(s.c.All()), "mkdir -p {store-dir}/logs")
-        s.c.Run(ctx, option.WithNodes(s.c.All()), "rm -f logs && ln -s {store-dir}/logs logs || true")
+    if err := s.Failer.Setup(ctx, s.f.L(), failures.DiskStallArgs{
+        StallLogs: s.stallLogs,
+        Nodes:     s.c.CRDBNodes().InstallNodes(),
+    }); err != nil {
+        s.f.Fatalf("failed to setup disk stall: %s", err)
+    }
+}
+func (s *cgroupDiskStaller) Cleanup(ctx context.Context) {
+    err := s.Failer.Cleanup(ctx, s.f.L())
+    if err != nil {
+        s.f.Fatalf("failed to cleanup disk stall: %s", err)
     }
 }
-func (s *cgroupDiskStaller) Cleanup(ctx context.Context) {}
 
 func (s *cgroupDiskStaller) Stall(ctx context.Context, nodes option.NodeListOption) {
-    // NB: I don't understand why, but attempting to set a bytesPerSecond={0,1}
-    // results in Invalid argument from the io.max cgroupv2 API.
-    s.Slow(ctx, nodes, 4)
+    if err := s.Failer.Inject(ctx, s.f.L(), failures.DiskStallArgs{
+        StallLogs:   s.stallLogs,
+        StallWrites: true,
+        StallReads:  s.stallReads,
+        Nodes:       nodes.InstallNodes(),
+    }); err != nil {
+        s.f.Fatalf("failed to stall disk: %s", err)
+    }
 }
 
 func (s *cgroupDiskStaller) Slow(
     ctx context.Context, nodes option.NodeListOption, bytesPerSecond int,
 ) {
-    // Shuffle the order of read and write stall initiation.
-    rand.Shuffle(len(s.readOrWrite), func(i, j int) {
-        s.readOrWrite[i], s.readOrWrite[j] = s.readOrWrite[j], s.readOrWrite[i]
-    })
-    for _, rw := range s.readOrWrite {
-        if err := s.setThroughput(ctx, nodes, rw, throughput{limited: true, bytesPerSecond: bytesPerSecond}); err != nil {
-            s.f.Fatal(err)
-        }
+    if err := s.Failer.Inject(ctx, s.f.L(), failures.DiskStallArgs{
+        StallLogs:   s.stallLogs,
+        StallWrites: true,
+        StallReads:  s.stallReads,
+        Nodes:       nodes.InstallNodes(),
+        Throughput:  bytesPerSecond,
+    }); err != nil {
+        s.f.Fatalf("failed to slow disk: %s", err)
     }
 }
 
 func (s *cgroupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
-    for _, rw := range s.readOrWrite {
-        err := s.setThroughput(ctx, nodes, rw, throughput{limited: false})
-        if err != nil {
-            s.f.L().PrintfCtx(ctx, "error unstalling the disk; stumbling on: %v", err)
-        }
-        // NB: We log the error and continue on because unstalling may not
-        // succeed if the process has successfully exited.
+    if err := s.Failer.Recover(ctx, s.f.L()); err != nil {
+        s.f.Fatalf("failed to unstall disk: %s", err)
     }
 }
 
-func (s *cgroupDiskStaller) device(nodes option.NodeListOption) (major, minor int) {
-    // TODO(jackson): Programmatically determine the device major,minor numbers.
-    // eg,:
-    // deviceName := getDevice(s.t, s.c)
-    // `cat /proc/partitions` and find `deviceName`
-    res, err := s.c.RunWithDetailsSingleNode(context.TODO(), s.f.L(), option.WithNodes(nodes[:1]), "lsblk | grep /mnt/data1 | awk '{print $2}'")
-    if err != nil {
-        s.f.Fatalf("error when determining block device: %s", err)
-        return 0, 0
-    }
-    parts := strings.Split(strings.TrimSpace(res.Stdout), ":")
-    if len(parts) != 2 {
-        s.f.Fatalf("unexpected output from lsblk: %s", res.Stdout)
-        return 0, 0
-    }
-    major, err = strconv.Atoi(parts[0])
-    if err != nil {
-        s.f.Fatalf("error when determining block device: %s", err)
-        return 0, 0
-    }
-    minor, err = strconv.Atoi(parts[1])
-    if err != nil {
-        s.f.Fatalf("error when determining block device: %s", err)
-        return 0, 0
-    }
-    return major, minor
-}
-
-type throughput struct {
-    limited        bool
-    bytesPerSecond int
-}
-
-type bandwidthReadWrite int8
-
-const (
-    readBandwidth bandwidthReadWrite = iota
-    writeBandwidth
-)
-
-func (rw bandwidthReadWrite) cgroupV2BandwidthProp() string {
-    switch rw {
-    case readBandwidth:
-        return "rbps"
-    case writeBandwidth:
-        return "wbps"
-    default:
-        panic("unreachable")
-    }
-}
-
-func (s *cgroupDiskStaller) setThroughput(
-    ctx context.Context, nodes option.NodeListOption, rw bandwidthReadWrite, bw throughput,
-) error {
-    maj, min := s.device(nodes)
-    cockroachIOController := filepath.Join("/sys/fs/cgroup/system.slice", SystemInterfaceSystemdUnitName()+".service", "io.max")
-
-    bytesPerSecondStr := "max"
-    if bw.limited {
-        bytesPerSecondStr = fmt.Sprintf("%d", bw.bytesPerSecond)
-    }
-    return s.c.RunE(ctx, option.WithNodes(nodes), "sudo", "/bin/bash", "-c", fmt.Sprintf(
-        `'echo %d:%d %s=%s > %s'`,
-        maj,
-        min,
-        rw.cgroupV2BandwidthProp(),
-        bytesPerSecondStr,
-        cockroachIOController,
-    ))
-}
-
-func GetDiskDevice(f Fataler, c cluster.Cluster, nodes option.NodeListOption) string {
-    res, err := c.RunWithDetailsSingleNode(context.TODO(), f.L(), option.WithNodes(nodes[:1]), "lsblk | grep /mnt/data1 | awk '{print $1}'")
-    if err != nil {
-        f.Fatalf("error when determining block device: %s", err)
-        return ""
-    }
-    return "/dev/" + strings.TrimSpace(res.Stdout)
-}
-
 type dmsetupDiskStaller struct {
+    *failures.Failer
     f Fataler
     c cluster.Cluster
-
-    dev string // set in Setup; s.device() doesn't work when volume is not set up
 }
 
 var _ DiskStaller = (*dmsetupDiskStaller)(nil)
 
-func (s *dmsetupDiskStaller) device(nodes option.NodeListOption) string {
-    return GetDiskDevice(s.f, s.c, nodes)
+func MakeDmsetupDiskStaller(f Fataler, c cluster.Cluster) DiskStaller {
+    diskStaller, err := c.GetFailer(f.L(), c.CRDBNodes(), failures.DmsetupDiskStallName)
+    if err != nil {
+        f.Fatalf("failed to get failer: %s", err)
+    }
+    return &dmsetupDiskStaller{Failer: diskStaller, f: f, c: c}
 }
 
 func (s *dmsetupDiskStaller) Setup(ctx context.Context) {
     if _, ok := s.c.Spec().ReusePolicy.(spec.ReusePolicyNone); !ok {
         // We disable journaling and do all kinds of things below.
         s.f.Fatalf("cluster needs ReusePolicyNone to support disk stalls")
     }
-    s.dev = s.device(s.c.All())
-    // snapd will run "snapd auto-import /dev/dm-0" via udev triggers when
-    // /dev/dm-0 is created. This possibly interferes with the dmsetup create
-    // reload, so uninstall snapd.
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo apt-get purge -y snapd`)
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo umount -f /mnt/data1 || true`)
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo dmsetup remove_all`)
-    // See https://github.com/cockroachdb/cockroach/issues/129619#issuecomment-2316147244.
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo tune2fs -O ^has_journal `+s.dev)
-    err := s.c.RunE(ctx, option.WithNodes(s.c.All()), `echo "0 $(sudo blockdev --getsz `+s.dev+`) linear `+s.dev+` 0" | `+
-        `sudo dmsetup create data1`)
-    if err != nil {
-        // This has occasionally been seen to fail with "Device or resource busy",
-        // with no clear explanation. Try to find out who it is.
-        s.c.Run(ctx, option.WithNodes(s.c.All()), "sudo bash -c 'ps aux; dmsetup status; mount; lsof'")
-        s.f.Fatal(err)
+    if err := s.Failer.Setup(ctx, s.f.L(), failures.DiskStallArgs{Nodes: s.c.CRDBNodes().InstallNodes()}); err != nil {
+        s.f.Fatalf("failed to setup disk stall: %s", err)
     }
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo mount /dev/mapper/data1 /mnt/data1`)
 }
 
 func (s *dmsetupDiskStaller) Cleanup(ctx context.Context) {
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo dmsetup resume data1`)
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo umount /mnt/data1`)
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo dmsetup remove_all`)
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo tune2fs -O has_journal `+s.dev)
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo mount /mnt/data1`)
-    // Reinstall snapd in case subsequent tests need it.
-    s.c.Run(ctx, option.WithNodes(s.c.All()), `sudo apt-get install -y snapd`)
+    if err := s.Failer.Cleanup(ctx, s.f.L()); err != nil {
+        s.f.Fatalf("failed to cleanup disk stall: %s", err)
+    }
 }
 
 func (s *dmsetupDiskStaller) Stall(ctx context.Context, nodes option.NodeListOption) {
-    s.c.Run(ctx, option.WithNodes(nodes), `sudo dmsetup suspend --noflush --nolockfs data1`)
+    if err := s.Failer.Inject(ctx, s.f.L(), failures.DiskStallArgs{
+        Nodes: nodes.InstallNodes(),
+    }); err != nil {
+        s.f.Fatalf("failed to stall disk: %s", err)
+    }
 }
 
 func (s *dmsetupDiskStaller) Slow(
@@ -251,12 +168,10 @@ func (s *dmsetupDiskStaller) Slow(
 }
 
 func (s *dmsetupDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
-    s.c.Run(ctx, option.WithNodes(nodes), `sudo dmsetup resume data1`)
+    if err := s.Failer.Recover(ctx, s.f.L()); err != nil {
+        s.f.Fatalf("failed to unstall disk: %s", err)
+    }
 }
 
 func (s *dmsetupDiskStaller) DataDir() string { return "{store-dir}" }
 func (s *dmsetupDiskStaller) LogDir() string { return "logs" }
-
-func MakeDmsetupDiskStaller(f Fataler, c cluster.Cluster) DiskStaller {
-    return &dmsetupDiskStaller{f: f, c: c}
-}
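
Test code that keeps using the retained roachtestutil wrapper is unchanged in shape; the staller now simply forwards to the embedded Failer. A rough sketch, with workload and timing details omitted and `t`/`c` assumed to come from the test harness:

    staller := roachtestutil.MakeCgroupDiskStaller(t, c, false /* stallReads */, true /* stallLogs */)
    staller.Setup(ctx)
    defer staller.Cleanup(ctx)

    // Stall writes (and, with stallLogs, the log directory) on n1, then recover.
    staller.Stall(ctx, c.Node(1))
    // ... let the workload observe the stall ...
    staller.Unstall(ctx, c.Node(1))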
