@@ -7,18 +7,17 @@ package roachtestutil
7
7
8
8
import (
9
9
"context"
10
- "fmt"
11
- "math/rand"
12
- "path/filepath"
13
- "strconv"
14
- "strings"
15
10
16
11
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
17
12
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
18
13
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
14
+ "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
19
15
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
20
16
)
21
17
18
+ // TODO(darryl): Once the failure injection library is a first class citizen of roachtest,
19
+ // i.e. synced with the monitor, test + cluster spec validation, observability into failure
20
+ // modes, etc. we can remove this interface entirely.
22
21
type DiskStaller interface {
23
22
Setup (ctx context.Context )
24
23
Cleanup (ctx context.Context )
@@ -48,20 +47,23 @@ type Fataler interface {
48
47
}
49
48
50
49
type cgroupDiskStaller struct {
51
- f Fataler
52
- c cluster.Cluster
53
- readOrWrite []bandwidthReadWrite
54
- logsToo bool
50
+ * failures.Failer
51
+ f Fataler
52
+ c cluster.Cluster
53
+ stallReads bool
54
+ stallLogs bool
55
55
}
56
56
57
57
var _ DiskStaller = (* cgroupDiskStaller )(nil )
58
58
59
- func MakeCgroupDiskStaller (f Fataler , c cluster.Cluster , readsToo bool , logsToo bool ) DiskStaller {
60
- bwRW := []bandwidthReadWrite {writeBandwidth }
61
- if readsToo {
62
- bwRW = append (bwRW , readBandwidth )
59
+ func MakeCgroupDiskStaller (
60
+ f Fataler , c cluster.Cluster , stallReads bool , stallLogs bool ,
61
+ ) DiskStaller {
62
+ diskStaller , err := c .GetFailer (f .L (), c .CRDBNodes (), failures .CgroupsDiskStallName )
63
+ if err != nil {
64
+ f .Fatalf ("failed to get failer: %s" , err )
63
65
}
64
- return & cgroupDiskStaller {f : f , c : c , readOrWrite : bwRW , logsToo : logsToo }
66
+ return & cgroupDiskStaller {Failer : diskStaller , f : f , c : c , stallReads : stallReads , stallLogs : stallLogs }
65
67
}
66
68
67
69
func (s * cgroupDiskStaller ) DataDir () string { return "{store-dir}" }
@@ -73,174 +75,89 @@ func (s *cgroupDiskStaller) Setup(ctx context.Context) {
73
75
// Safety measure.
74
76
s .f .Fatalf ("cluster needs ReusePolicyNone to support disk stalls" )
75
77
}
76
- if s .logsToo {
77
- s .c .Run (ctx , option .WithNodes (s .c .All ()), "mkdir -p {store-dir}/logs" )
78
- s .c .Run (ctx , option .WithNodes (s .c .All ()), "rm -f logs && ln -s {store-dir}/logs logs || true" )
78
+ if err := s .Failer .Setup (ctx , s .f .L (), failures.DiskStallArgs {
79
+ StallLogs : s .stallLogs ,
80
+ Nodes : s .c .CRDBNodes ().InstallNodes (),
81
+ }); err != nil {
82
+ s .f .Fatalf ("failed to setup disk stall: %s" , err )
83
+ }
84
+ }
85
+ func (s * cgroupDiskStaller ) Cleanup (ctx context.Context ) {
86
+ err := s .Failer .Cleanup (ctx , s .f .L ())
87
+ if err != nil {
88
+ s .f .Fatalf ("failed to cleanup disk stall: %s" , err )
79
89
}
80
90
}
81
- func (s * cgroupDiskStaller ) Cleanup (ctx context.Context ) {}
82
91
83
92
func (s * cgroupDiskStaller ) Stall (ctx context.Context , nodes option.NodeListOption ) {
84
- // NB: I don't understand why, but attempting to set a bytesPerSecond={0,1}
85
- // results in Invalid argument from the io.max cgroupv2 API.
86
- s .Slow (ctx , nodes , 4 )
93
+ if err := s .Failer .Inject (ctx , s .f .L (), failures.DiskStallArgs {
94
+ StallLogs : s .stallLogs ,
95
+ StallWrites : true ,
96
+ StallReads : s .stallReads ,
97
+ Nodes : nodes .InstallNodes (),
98
+ }); err != nil {
99
+ s .f .Fatalf ("failed to stall disk: %s" , err )
100
+ }
87
101
}
88
102
89
103
func (s * cgroupDiskStaller ) Slow (
90
104
ctx context.Context , nodes option.NodeListOption , bytesPerSecond int ,
91
105
) {
92
- // Shuffle the order of read and write stall initiation.
93
- rand . Shuffle ( len ( s . readOrWrite ), func ( i , j int ) {
94
- s . readOrWrite [ i ], s . readOrWrite [ j ] = s . readOrWrite [ j ], s . readOrWrite [ i ]
95
- })
96
- for _ , rw := range s . readOrWrite {
97
- if err := s . setThroughput ( ctx , nodes , rw , throughput { limited : true , bytesPerSecond : bytesPerSecond }); err != nil {
98
- s . f . Fatal ( err )
99
- }
106
+ if err := s . Failer . Inject ( ctx , s . f . L (), failures. DiskStallArgs {
107
+ StallLogs : s . stallLogs ,
108
+ StallWrites : true ,
109
+ StallReads : s . stallReads ,
110
+ Nodes : nodes . InstallNodes (),
111
+ Throughput : bytesPerSecond ,
112
+ }); err != nil {
113
+ s . f . Fatalf ( "failed to slow disk: %s" , err )
100
114
}
101
115
}
102
116
103
117
func (s * cgroupDiskStaller ) Unstall (ctx context.Context , nodes option.NodeListOption ) {
104
- for _ , rw := range s .readOrWrite {
105
- err := s .setThroughput (ctx , nodes , rw , throughput {limited : false })
106
- if err != nil {
107
- s .f .L ().PrintfCtx (ctx , "error unstalling the disk; stumbling on: %v" , err )
108
- }
109
- // NB: We log the error and continue on because unstalling may not
110
- // succeed if the process has successfully exited.
118
+ if err := s .Failer .Recover (ctx , s .f .L ()); err != nil {
119
+ s .f .Fatalf ("failed to unstall disk: %s" , err )
111
120
}
112
121
}
113
122
114
- func (s * cgroupDiskStaller ) device (nodes option.NodeListOption ) (major , minor int ) {
115
- // TODO(jackson): Programmatically determine the device major,minor numbers.
116
- // eg,:
117
- // deviceName := getDevice(s.t, s.c)
118
- // `cat /proc/partitions` and find `deviceName`
119
- res , err := s .c .RunWithDetailsSingleNode (context .TODO (), s .f .L (), option .WithNodes (nodes [:1 ]), "lsblk | grep /mnt/data1 | awk '{print $2}'" )
120
- if err != nil {
121
- s .f .Fatalf ("error when determining block device: %s" , err )
122
- return 0 , 0
123
- }
124
- parts := strings .Split (strings .TrimSpace (res .Stdout ), ":" )
125
- if len (parts ) != 2 {
126
- s .f .Fatalf ("unexpected output from lsblk: %s" , res .Stdout )
127
- return 0 , 0
128
- }
129
- major , err = strconv .Atoi (parts [0 ])
130
- if err != nil {
131
- s .f .Fatalf ("error when determining block device: %s" , err )
132
- return 0 , 0
133
- }
134
- minor , err = strconv .Atoi (parts [1 ])
135
- if err != nil {
136
- s .f .Fatalf ("error when determining block device: %s" , err )
137
- return 0 , 0
138
- }
139
- return major , minor
140
- }
141
-
142
- type throughput struct {
143
- limited bool
144
- bytesPerSecond int
145
- }
146
-
147
- type bandwidthReadWrite int8
148
-
149
- const (
150
- readBandwidth bandwidthReadWrite = iota
151
- writeBandwidth
152
- )
153
-
154
- func (rw bandwidthReadWrite ) cgroupV2BandwidthProp () string {
155
- switch rw {
156
- case readBandwidth :
157
- return "rbps"
158
- case writeBandwidth :
159
- return "wbps"
160
- default :
161
- panic ("unreachable" )
162
- }
163
- }
164
-
165
- func (s * cgroupDiskStaller ) setThroughput (
166
- ctx context.Context , nodes option.NodeListOption , rw bandwidthReadWrite , bw throughput ,
167
- ) error {
168
- maj , min := s .device (nodes )
169
- cockroachIOController := filepath .Join ("/sys/fs/cgroup/system.slice" , SystemInterfaceSystemdUnitName ()+ ".service" , "io.max" )
170
-
171
- bytesPerSecondStr := "max"
172
- if bw .limited {
173
- bytesPerSecondStr = fmt .Sprintf ("%d" , bw .bytesPerSecond )
174
- }
175
- return s .c .RunE (ctx , option .WithNodes (nodes ), "sudo" , "/bin/bash" , "-c" , fmt .Sprintf (
176
- `'echo %d:%d %s=%s > %s'` ,
177
- maj ,
178
- min ,
179
- rw .cgroupV2BandwidthProp (),
180
- bytesPerSecondStr ,
181
- cockroachIOController ,
182
- ))
183
- }
184
-
185
- func GetDiskDevice (f Fataler , c cluster.Cluster , nodes option.NodeListOption ) string {
186
- res , err := c .RunWithDetailsSingleNode (context .TODO (), f .L (), option .WithNodes (nodes [:1 ]), "lsblk | grep /mnt/data1 | awk '{print $1}'" )
187
- if err != nil {
188
- f .Fatalf ("error when determining block device: %s" , err )
189
- return ""
190
- }
191
- return "/dev/" + strings .TrimSpace (res .Stdout )
192
- }
193
-
194
123
type dmsetupDiskStaller struct {
124
+ * failures.Failer
195
125
f Fataler
196
126
c cluster.Cluster
197
-
198
- dev string // set in Setup; s.device() doesn't work when volume is not set up
199
127
}
200
128
201
129
var _ DiskStaller = (* dmsetupDiskStaller )(nil )
202
130
203
- func (s * dmsetupDiskStaller ) device (nodes option.NodeListOption ) string {
204
- return GetDiskDevice (s .f , s .c , nodes )
131
+ func MakeDmsetupDiskStaller (f Fataler , c cluster.Cluster ) DiskStaller {
132
+ diskStaller , err := c .GetFailer (f .L (), c .CRDBNodes (), failures .DmsetupDiskStallName )
133
+ if err != nil {
134
+ f .Fatalf ("failed to get failer: %s" , err )
135
+ }
136
+ return & dmsetupDiskStaller {Failer : diskStaller , f : f , c : c }
205
137
}
206
138
207
139
func (s * dmsetupDiskStaller ) Setup (ctx context.Context ) {
208
140
if _ , ok := s .c .Spec ().ReusePolicy .(spec.ReusePolicyNone ); ! ok {
209
141
// We disable journaling and do all kinds of things below.
210
142
s .f .Fatalf ("cluster needs ReusePolicyNone to support disk stalls" )
211
143
}
212
- s .dev = s .device (s .c .All ())
213
- // snapd will run "snapd auto-import /dev/dm-0" via udev triggers when
214
- // /dev/dm-0 is created. This possibly interferes with the dmsetup create
215
- // reload, so uninstall snapd.
216
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo apt-get purge -y snapd` )
217
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo umount -f /mnt/data1 || true` )
218
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo dmsetup remove_all` )
219
- // See https://github.com/cockroachdb/cockroach/issues/129619#issuecomment-2316147244.
220
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo tune2fs -O ^has_journal ` + s .dev )
221
- err := s .c .RunE (ctx , option .WithNodes (s .c .All ()), `echo "0 $(sudo blockdev --getsz ` + s .dev + `) linear ` + s .dev + ` 0" | ` +
222
- `sudo dmsetup create data1` )
223
- if err != nil {
224
- // This has occasionally been seen to fail with "Device or resource busy",
225
- // with no clear explanation. Try to find out who it is.
226
- s .c .Run (ctx , option .WithNodes (s .c .All ()), "sudo bash -c 'ps aux; dmsetup status; mount; lsof'" )
227
- s .f .Fatal (err )
144
+ if err := s .Failer .Setup (ctx , s .f .L (), failures.DiskStallArgs {Nodes : s .c .CRDBNodes ().InstallNodes ()}); err != nil {
145
+ s .f .Fatalf ("failed to setup disk stall: %s" , err )
228
146
}
229
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo mount /dev/mapper/data1 /mnt/data1` )
230
147
}
231
148
232
149
func (s * dmsetupDiskStaller ) Cleanup (ctx context.Context ) {
233
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo dmsetup resume data1` )
234
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo umount /mnt/data1` )
235
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo dmsetup remove_all` )
236
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo tune2fs -O has_journal ` + s .dev )
237
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo mount /mnt/data1` )
238
- // Reinstall snapd in case subsequent tests need it.
239
- s .c .Run (ctx , option .WithNodes (s .c .All ()), `sudo apt-get install -y snapd` )
150
+ if err := s .Failer .Cleanup (ctx , s .f .L ()); err != nil {
151
+ s .f .Fatalf ("failed to cleanup disk stall: %s" , err )
152
+ }
240
153
}
241
154
242
155
func (s * dmsetupDiskStaller ) Stall (ctx context.Context , nodes option.NodeListOption ) {
243
- s .c .Run (ctx , option .WithNodes (nodes ), `sudo dmsetup suspend --noflush --nolockfs data1` )
156
+ if err := s .Failer .Inject (ctx , s .f .L (), failures.DiskStallArgs {
157
+ Nodes : nodes .InstallNodes (),
158
+ }); err != nil {
159
+ s .f .Fatalf ("failed to stall disk: %s" , err )
160
+ }
244
161
}
245
162
246
163
func (s * dmsetupDiskStaller ) Slow (
@@ -251,12 +168,10 @@ func (s *dmsetupDiskStaller) Slow(
251
168
}
252
169
253
170
func (s * dmsetupDiskStaller ) Unstall (ctx context.Context , nodes option.NodeListOption ) {
254
- s .c .Run (ctx , option .WithNodes (nodes ), `sudo dmsetup resume data1` )
171
+ if err := s .Failer .Recover (ctx , s .f .L ()); err != nil {
172
+ s .f .Fatalf ("failed to unstall disk: %s" , err )
173
+ }
255
174
}
256
175
257
176
func (s * dmsetupDiskStaller ) DataDir () string { return "{store-dir}" }
258
177
func (s * dmsetupDiskStaller ) LogDir () string { return "logs" }
259
-
260
- func MakeDmsetupDiskStaller (f Fataler , c cluster.Cluster ) DiskStaller {
261
- return & dmsetupDiskStaller {f : f , c : c }
262
- }
0 commit comments