@@ -53,6 +53,9 @@ var _ = BeforeSuite(func() {
 		factory.GetClusterOptions()...,
 	)
 
+	// Make sure the unschedulable Pod is not removed
+	Expect(fdbCluster.SetAutoReplacements(false, 12 * time.Hour)).NotTo(HaveOccurred())
+
 	// Load some data into the cluster.
 	factory.CreateDataLoaderIfAbsent(fdbCluster)
 })
@@ -66,6 +69,7 @@ var _ = AfterSuite(func() {
 
 var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
 	AfterEach(func() {
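+		// Clear any no-schedule buggify entries so later test cases start from a clean state.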
+		Expect(fdbCluster.ClearBuggifyNoSchedule(false)).NotTo(HaveOccurred())
 		if CurrentSpecReport().Failed() {
 			factory.DumpState(fdbCluster)
 		}
@@ -159,5 +163,155 @@ var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
 				log.Println("It took:", time.Since(startTime).String(), "to detect the failure")
 			})
 		})
+
+		When("there is a failure during maintenance mode", func() {
+			BeforeEach(func() {
+				// Set the maintenance mode for a long duration, e.g. 2 hours, to make sure the mode is not
+				// timing out but is actually being reset.
+				fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 7200", faultDomain), false, 60)
+			})
+
+			AfterEach(func() {
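+				// Remove the crash-loop configuration again so the recreated Pod can start normally.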
+				fdbCluster.SetCrashLoopContainers(nil, false)
+			})
+
+			It("should remove the maintenance mode", func() {
+				// Make sure the team tracker status stays healthy despite the failed Pod and that the maintenance zone is set.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+
+				log.Println("When another storage Pod is failing")
+				var podToRecreate corev1.Pod
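+
+				// Pick any storage Pod other than the one that is already failing; this assumes
+				// the cluster runs at least two storage Pods.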
+				for _, pod := range fdbCluster.GetStoragePods().Items {
+					if pod.Name == failingStoragePod.Name {
+						continue
+					}
+
+					podToRecreate = pod
+					break
+				}
+
+				// We delete a Pod and make it crash-looping, to make sure the process is failed for more than 60 seconds.
+				log.Println("Delete Pod", podToRecreate.Name)
+				fdbCluster.SetCrashLoopContainers([]fdbv1beta2.CrashLoopContainerObject{
+					{
+						ContainerName: fdbv1beta2.MainContainerName,
+						Targets:       []fdbv1beta2.ProcessGroupID{fixtures.GetProcessGroupID(podToRecreate)},
+					},
+				}, false)
+				factory.DeletePod(&podToRecreate)
+
+				log.Println("Wait until maintenance mode is reset")
+				startTime := time.Now()
+				// We would expect that the team tracker gets unhealthy once the maintenance mode is reset.
+				Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeFalse())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))
+
+				log.Println("It took:", time.Since(startTime).String(), "to detect the failure")
+			})
+		})
+
+		When("there is additional load on the cluster", func() {
+			BeforeEach(func() {
+				// Set the maintenance mode for a long duration, e.g. 2 hours, to make sure the mode is not
+				// timing out during the test.
+				fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 7200", faultDomain), false, 60)
+			})
+
+			AfterEach(func() {
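+				// Verify the maintenance mode stayed active for the whole test before cleaning up.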
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+			})
+
+			It("should not remove the maintenance mode", func() {
+				// Make sure the team tracker status stays healthy despite the failed Pod and that the maintenance zone is set.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+
+				log.Println("When loading additional data into the cluster")
+				factory.CreateDataLoaderIfAbsent(fdbCluster)
+				factory.CreateDataLoaderIfAbsent(fdbCluster)
+			})
+		})
+
+		When("the number of storage Pods is changed", func() {
+			var initialStoragePods int
+
+			BeforeEach(func() {
+				counts, err := fdbCluster.GetCluster().GetProcessCountsWithDefaults()
+				Expect(err).NotTo(HaveOccurred())
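+
+				// Remember the initial storage process count so the AfterEach below can restore it.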
+				initialStoragePods = counts.Storage
+				// Set the maintenance mode for a long duration, e.g. 2 hours, to make sure the mode is not
+				// timing out during the test.
+				fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 7200", faultDomain), false, 60)
+			})
+
+			AfterEach(func() {
+				Expect(fdbCluster.ClearBuggifyNoSchedule(false)).NotTo(HaveOccurred())
+				spec := fdbCluster.GetCluster().Spec.DeepCopy()
+				// Reset the storage Pod count to its initial value.
+				spec.ProcessCounts.Storage = initialStoragePods
+				fdbCluster.UpdateClusterSpecWithSpec(spec)
+			})
+
+			It("should not remove the maintenance mode", func() {
+				// Make sure the team tracker status stays healthy despite the failed Pod and that the maintenance zone is set.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+
+				log.Println("When adding additional storage processes to the cluster")
+				spec := fdbCluster.GetCluster().Spec.DeepCopy()
+				// Add 3 additional storage Pods.
+				spec.ProcessCounts.Storage = initialStoragePods + 3
+				fdbCluster.UpdateClusterSpecWithSpec(spec)
+
+				// Make sure the maintenance mode is kept and the team tracker shows healthy.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+			})
+		})
 	})
 })