
Commit 06f1d04

Add additional tests for the maintenance mode (#1921)
1 parent 6d799f5 commit 06f1d04

File tree

1 file changed: +154 -0 lines changed

e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go

Lines changed: 154 additions & 0 deletions
@@ -53,6 +53,9 @@ var _ = BeforeSuite(func() {
 		factory.GetClusterOptions()...,
 	)
 
+	// Make sure the unschedulable Pod is not removed
+	Expect(fdbCluster.SetAutoReplacements(false, 12*time.Hour)).NotTo(HaveOccurred())
+
 	// Load some data into the cluster.
 	factory.CreateDataLoaderIfAbsent(fdbCluster)
 })
@@ -66,6 +69,7 @@ var _ = AfterSuite(func() {
 
 var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
 	AfterEach(func() {
+		Expect(fdbCluster.ClearBuggifyNoSchedule(false)).NotTo(HaveOccurred())
 		if CurrentSpecReport().Failed() {
 			factory.DumpState(fdbCluster)
 		}
@@ -159,5 +163,155 @@ var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
 				log.Println("It took:", time.Since(startTime).String(), "to detect the failure")
 			})
 		})
+
+		When("there is a failure during maintenance mode", func() {
+			BeforeEach(func() {
+				// Set the maintenance mode for a long duration, e.g. 2 hours, to make sure the mode is not
+				// timing out but is actually being reset.
+				fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 7200", faultDomain), false, 60)
+			})
+
+			AfterEach(func() {
+				fdbCluster.SetCrashLoopContainers(nil, false)
+			})
+
+			It("should remove the maintenance mode", func() {
+				// Make sure the team tracker status stays healthy despite the failed Pod and that the maintenance zone is set.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+
+				log.Println("When another storage Pod is failing")
+				var podToRecreate corev1.Pod
+
+				for _, pod := range fdbCluster.GetStoragePods().Items {
+					if pod.Name == failingStoragePod.Name {
+						continue
+					}
+
+					podToRecreate = pod
+					break
+				}
+
+				// We delete a Pod and make it crash-loop to make sure the process stays failed for more than 60 seconds.
+				log.Println("Delete Pod", podToRecreate.Name)
+				fdbCluster.SetCrashLoopContainers([]fdbv1beta2.CrashLoopContainerObject{
+					{
+						ContainerName: fdbv1beta2.MainContainerName,
+						Targets:       []fdbv1beta2.ProcessGroupID{fixtures.GetProcessGroupID(podToRecreate)},
+					},
+				}, false)
+				factory.DeletePod(&podToRecreate)
+
+				log.Println("Wait until maintenance mode is reset")
+				startTime := time.Now()
+				// We expect the team trackers to become unhealthy once the maintenance mode is reset.
+				Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeFalse())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))
+
+				log.Println("It took:", time.Since(startTime).String(), "to detect the failure")
+			})
+		})
+
+		When("there is additional load on the cluster", func() {
+			BeforeEach(func() {
+				// Set the maintenance mode for a long duration, e.g. 2 hours, to make sure the mode cannot
+				// time out during the test but would have to be explicitly reset.
+				fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 7200", faultDomain), false, 60)
+			})
+
+			AfterEach(func() {
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+			})
+
+			It("should not remove the maintenance mode", func() {
+				// Make sure the team tracker status stays healthy despite the failed Pod and that the maintenance zone is set.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+
+				log.Println("When loading additional data into the cluster")
+				factory.CreateDataLoaderIfAbsent(fdbCluster)
+				factory.CreateDataLoaderIfAbsent(fdbCluster)
+			})
+		})
+
+		When("the number of storage Pods is changed", func() {
+			var initialStoragePods int
+
+			BeforeEach(func() {
+				counts, err := fdbCluster.GetCluster().GetProcessCountsWithDefaults()
+				Expect(err).NotTo(HaveOccurred())
+
+				initialStoragePods = counts.Storage
+				// Set the maintenance mode for a long duration, e.g. 2 hours, to make sure the mode cannot
+				// time out during the test but would have to be explicitly reset.
+				fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 7200", faultDomain), false, 60)
+			})
+
+			AfterEach(func() {
+				Expect(fdbCluster.ClearBuggifyNoSchedule(false)).NotTo(HaveOccurred())
+				spec := fdbCluster.GetCluster().Spec.DeepCopy()
+				// Reset the storage process count to its initial value.
+				spec.ProcessCounts.Storage = initialStoragePods
+				fdbCluster.UpdateClusterSpecWithSpec(spec)
+			})
+
+			It("should not remove the maintenance mode", func() {
+				// Make sure the team tracker status stays healthy despite the failed Pod and that the maintenance zone is set.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+
+				log.Println("When adding additional storage processes to the cluster")
+				spec := fdbCluster.GetCluster().Spec.DeepCopy()
+				// Add 3 additional storage Pods.
+				spec.ProcessCounts.Storage = initialStoragePods + 3
+				fdbCluster.UpdateClusterSpecWithSpec(spec)
+
+				// Make sure the maintenance mode is kept and the team trackers stay healthy.
+				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
+					status := fdbCluster.GetStatus()
+
+					for _, tracker := range status.Cluster.Data.TeamTrackers {
+						g.Expect(tracker.State.Healthy).To(BeTrue())
+					}
+
+					return status.Cluster.MaintenanceZone
+				}).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
+			})
+		})
 	})
 })
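
Note on the assertion pattern: every check above re-evaluates a callback on a polling interval, runs intermediate expectations against the passed-in Gomega instance, and compares the returned maintenance zone against the expected value. The following is a minimal, self-contained sketch of that pattern (not part of the commit); getMaintenanceZone and isTeamTrackerHealthy are hypothetical stand-ins for the fdbCluster fixture calls (fdbCluster.GetStatus() and the team tracker fields) used in the real test, and it assumes a Gomega version that accepts a Gomega argument in Eventually callbacks, as the test itself does.

package example

import (
	"testing"
	"time"

	. "github.com/onsi/gomega"
)

// Hypothetical stubs standing in for the cluster fixture used in the e2e test.
func getMaintenanceZone() string  { return "" }
func isTeamTrackerHealthy() bool  { return false }

func TestMaintenanceZoneIsCleared(t *testing.T) {
	g := NewWithT(t)

	// Poll every 2 seconds, for up to 10 minutes, until the team trackers
	// report unhealthy and the maintenance zone has been cleared.
	g.Eventually(func(g Gomega) string {
		// Failing expectations here only fail the current poll attempt;
		// Eventually keeps retrying until the timeout expires.
		g.Expect(isTeamTrackerHealthy()).To(BeFalse())
		return getMaintenanceZone()
	}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(""))
}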
