Skip to content

Commit d98e1ea

Browse files
authored
Add e2e test case for maintenance mode interaction (#1915)
1 parent 6d2439f commit d98e1ea

File tree

3 files changed

+199
-2
lines changed

3 files changed

+199
-2
lines changed

e2e/fixtures/fdb_cluster.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -748,8 +748,8 @@ func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) error {
748748

749749
// SetProcessGroupsAsUnschedulable sets the provided process groups on the NoSchedule list of the current FoundationDBCluster. This will make
750750
// sure that the Pods of those process groups are stuck in Pending.
751-
func (fdbCluster *FdbCluster) SetProcessGroupsAsUnschedulable(procesGroups []fdbv1beta2.ProcessGroupID) {
752-
fdbCluster.cluster.Spec.Buggify.NoSchedule = procesGroups
751+
func (fdbCluster *FdbCluster) SetProcessGroupsAsUnschedulable(processGroups []fdbv1beta2.ProcessGroupID) {
752+
fdbCluster.cluster.Spec.Buggify.NoSchedule = processGroups
753753
fdbCluster.UpdateClusterSpec()
754754
}
755755

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
/*
2+
* operator_maintenance_mode_test.go
3+
*
4+
* This source file is part of the FoundationDB open source project
5+
*
6+
* Copyright 2024 Apple Inc. and the FoundationDB project authors
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package operatorha
22+
23+
/*
24+
This test suite includes tests around the interaction of the maintenance mode and the operator.
25+
*/
26+
27+
import (
28+
"fmt"
29+
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
30+
corev1 "k8s.io/api/core/v1"
31+
"log"
32+
"time"
33+
34+
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
35+
. "github.com/onsi/ginkgo/v2"
36+
. "github.com/onsi/gomega"
37+
)
38+
39+
var (
40+
factory *fixtures.Factory
41+
fdbCluster *fixtures.FdbCluster
42+
testOptions *fixtures.FactoryOptions
43+
)
44+
45+
func init() {
46+
testOptions = fixtures.InitFlags()
47+
}
48+
49+
var _ = BeforeSuite(func() {
50+
factory = fixtures.CreateFactory(testOptions)
51+
fdbCluster = factory.CreateFdbCluster(
52+
fixtures.DefaultClusterConfig(false),
53+
factory.GetClusterOptions()...,
54+
)
55+
56+
// Load some data into the cluster.
57+
factory.CreateDataLoaderIfAbsent(fdbCluster)
58+
})
59+
60+
var _ = AfterSuite(func() {
61+
if CurrentSpecReport().Failed() {
62+
log.Printf("failed due to %s", CurrentSpecReport().FailureMessage())
63+
}
64+
factory.Shutdown()
65+
})
66+
67+
var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
68+
AfterEach(func() {
69+
if CurrentSpecReport().Failed() {
70+
factory.DumpState(fdbCluster)
71+
}
72+
Expect(fdbCluster.WaitForReconciliation()).ToNot(HaveOccurred())
73+
factory.StopInvariantCheck()
74+
// Make sure all data is present in the cluster
75+
fdbCluster.EnsureTeamTrackersAreHealthy()
76+
fdbCluster.EnsureTeamTrackersHaveMinReplicas()
77+
})
78+
79+
When("the maintenance mode is set", func() {
80+
var failingStoragePod corev1.Pod
81+
var faultDomain fdbv1beta2.FaultDomain
82+
83+
BeforeEach(func() {
84+
failingStoragePod = fixtures.RandomPickOnePod(fdbCluster.GetStoragePods().Items)
85+
86+
// Set maintenance mode for this Pod
87+
for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
88+
if processGroup.ProcessClass != fdbv1beta2.ProcessClassStorage {
89+
continue
90+
}
91+
92+
if processGroup.ProcessGroupID == fixtures.GetProcessGroupID(failingStoragePod) {
93+
faultDomain = processGroup.FaultDomain
94+
}
95+
}
96+
97+
// Set the maintenance mode for 4 minutes.
98+
fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 240", faultDomain), false, 60)
99+
100+
// Set this Pod as unschedulable to keep it pending.
101+
Expect(fdbCluster.SetPodAsUnschedulable(failingStoragePod)).NotTo(HaveOccurred())
102+
})
103+
104+
AfterEach(func() {
105+
// Make sure that the quota is deleted and new PVCs can be created.
106+
Expect(fdbCluster.ClearBuggifyNoSchedule(true)).NotTo(HaveOccurred())
107+
// Reset the maintenance mode
108+
fdbCluster.RunFdbCliCommandInOperator("maintenance off", false, 60)
109+
})
110+
111+
When("the Pod comes back before the maintenance mode times out", func() {
112+
It("should not set the team tracker status to unhealthy", func() {
113+
// Make sure the team tracker status shows healthy for the failed Pod and the maintenance zone is set.
114+
Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
115+
status := fdbCluster.GetStatus()
116+
117+
for _, tracker := range status.Cluster.Data.TeamTrackers {
118+
log.Println(tracker.State.Name, ":", tracker.State.Healthy)
119+
g.Expect(tracker.State.Healthy).To(BeTrue())
120+
}
121+
122+
log.Println("Maintenance Zone:", status.Cluster.MaintenanceZone)
123+
return status.Cluster.MaintenanceZone
124+
}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
125+
})
126+
})
127+
128+
When("the maintenance mode times out", func() {
129+
It("should update the team tracker status to unhealthy", func() {
130+
// Make sure the team tracker status shows healthy for the failed Pod and the maintenance zone is set.
131+
Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
132+
status := fdbCluster.GetStatus()
133+
134+
for _, tracker := range status.Cluster.Data.TeamTrackers {
135+
g.Expect(tracker.State.Healthy).To(BeTrue())
136+
}
137+
138+
return status.Cluster.MaintenanceZone
139+
}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
140+
141+
log.Println("Wait until maintenance mode times out")
142+
// Wait until the maintenance zone is reset
143+
Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
144+
return fdbCluster.GetStatus().Cluster.MaintenanceZone
145+
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))
146+
147+
startTime := time.Now()
148+
log.Println("Wait until failure is detected")
149+
// We would expect that the team tracker gets unhealthy once the maintenance mode is timed out.
150+
Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
151+
status := fdbCluster.GetStatus()
152+
for _, tracker := range status.Cluster.Data.TeamTrackers {
153+
g.Expect(tracker.State.Healthy).To(BeFalse())
154+
}
155+
156+
return status.Cluster.MaintenanceZone
157+
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))
158+
159+
log.Println("It took:", time.Since(startTime).String(), "to detected the failure")
160+
})
161+
})
162+
})
163+
})
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* suite_test.go
3+
*
4+
* This source file is part of the FoundationDB open source project
5+
*
6+
* Copyright 2024 Apple Inc. and the FoundationDB project authors
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package operatorha
22+
23+
import (
24+
"testing"
25+
"time"
26+
27+
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
28+
"github.com/onsi/gomega"
29+
)
30+
31+
func TestOperatorHA(t *testing.T) {
32+
gomega.SetDefaultEventuallyTimeout(10 * time.Second)
33+
fixtures.RunGinkgoTests(t, "Operator maintenance mode test suite")
34+
}

0 commit comments

Comments
 (0)