Skip to content

Commit 70675fd

Browse files
authored
Fix exclusion logic multiple processes (#1721)
* Make sure we verify the exclusion of multiple processes correctly
1 parent 9325541 commit 70675fd

File tree

3 files changed

+210
-22
lines changed

3 files changed

+210
-22
lines changed

e2e/test_operator_upgrades/operator_upgrades_test.go

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,10 @@ var _ = AfterSuite(func() {
5959
}
6060
})
6161

62-
func clusterSetup(beforeVersion string, availabilityCheck bool) {
62+
func clusterSetupWithConfig(beforeVersion string, availabilityCheck bool, config *fixtures.ClusterConfig) {
6363
factory.SetBeforeVersion(beforeVersion)
6464
fdbCluster = factory.CreateFdbCluster(
65-
&fixtures.ClusterConfig{
66-
DebugSymbols: false,
67-
},
65+
config,
6866
factory.GetClusterOptions(fixtures.UseVersionBeforeUpgrade)...,
6967
)
7068

@@ -79,6 +77,12 @@ func clusterSetup(beforeVersion string, availabilityCheck bool) {
7977
).ShouldNot(HaveOccurred())
8078
}
8179

80+
func clusterSetup(beforeVersion string, availabilityCheck bool) {
81+
clusterSetupWithConfig(beforeVersion, availabilityCheck, &fixtures.ClusterConfig{
82+
DebugSymbols: false,
83+
})
84+
}
85+
8286
// Checks if cluster is running at the expectedVersion. This is done by checking the FoundationDBCluster status.
8387
// Before that we checked the cluster status json by checking the reported version of all processes. This approach only worked for
8488
// version compatible upgrades, since incompatible processes won't be part of the cluster anyway. To simplify the check
@@ -1071,4 +1075,64 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() {
10711075
fixtures.GenerateUpgradeTableEntries(testOptions),
10721076
)
10731077

1078+
DescribeTable(
1079+
"with 2 storage servers per Pod",
1080+
func(beforeVersion string, targetVersion string) {
1081+
clusterSetupWithConfig(beforeVersion, true, &fixtures.ClusterConfig{
1082+
DebugSymbols: false,
1083+
StorageServerPerPod: 2,
1084+
})
1085+
1086+
Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())
1087+
// Make sure the cluster is still running with 2 storage servers per Pod.
1088+
Expect(fdbCluster.GetCluster().Spec.StorageServersPerPod).To(Equal(2))
1089+
1090+
if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) {
1091+
// Ensure that the operator is setting the IncorrectConfigMap and IncorrectCommandLine conditions during the upgrade
1092+
// process.
1093+
expectedConditions := map[fdbv1beta2.ProcessGroupConditionType]bool{
1094+
fdbv1beta2.IncorrectConfigMap: true,
1095+
fdbv1beta2.IncorrectCommandLine: true,
1096+
}
1097+
Eventually(func() bool {
1098+
cluster := fdbCluster.GetCluster()
1099+
1100+
for _, processGroup := range cluster.Status.ProcessGroups {
1101+
if !processGroup.MatchesConditions(expectedConditions) {
1102+
return false
1103+
}
1104+
}
1105+
1106+
return true
1107+
}).WithTimeout(10 * time.Minute).WithPolling(5 * time.Second).Should(BeTrue())
1108+
}
1109+
1110+
transactionSystemProcessGroups := make(map[fdbv1beta2.ProcessGroupID]fdbv1beta2.None)
1111+
// Wait until the cluster is upgraded and fully reconciled.
1112+
Expect(fdbCluster.WaitUntilWithForceReconcile(2, 600, func(cluster *fdbv1beta2.FoundationDBCluster) bool {
1113+
for _, processGroup := range cluster.Status.ProcessGroups {
1114+
if processGroup.ProcessClass == fdbv1beta2.ProcessClassStorage {
1115+
continue
1116+
}
1117+
1118+
transactionSystemProcessGroups[processGroup.ProcessGroupID] = fdbv1beta2.None{}
1119+
}
1120+
1121+
// Allow soft reconciliation and make sure the running version was updated
1122+
return cluster.Status.Generations.Reconciled == cluster.Generation && cluster.Status.RunningVersion == targetVersion
1123+
})).NotTo(HaveOccurred())
1124+
1125+
// Get the desired process counts based on the current cluster configuration
1126+
processCounts, err := fdbCluster.GetProcessCounts()
1127+
Expect(err).NotTo(HaveOccurred())
1128+
1129+
// During an upgrade we expect that the transaction system processes are replaced, so we expect to have seen
1130+
// 2 times the process counts for transaction system processes. Add a small buffer of 5 to allow automatic
1131+
// replacements during an upgrade.
1132+
expectedProcessCounts := (processCounts.Total()-processCounts.Storage)*2 + 5
1133+
Expect(len(transactionSystemProcessGroups)).To(BeNumerically("<=", expectedProcessCounts))
1134+
},
1135+
EntryDescription("Upgrade from %[1]s to %[2]s"),
1136+
fixtures.GenerateUpgradeTableEntries(testOptions),
1137+
)
10741138
})

pkg/fdbstatus/status_checks.go

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ type exclusionStatus struct {
5050
// getRemainingAndExcludedFromStatus checks which processes of the input address list are excluded in the cluster and which are not.
5151
func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.FoundationDBStatus, addresses []fdbv1beta2.ProcessAddress) exclusionStatus {
5252
notExcludedAddresses := map[string]fdbv1beta2.None{}
53-
fullyExcludedAddresses := map[string]fdbv1beta2.None{}
54-
visitedAddresses := map[string]fdbv1beta2.None{}
53+
fullyExcludedAddresses := map[string]int{}
54+
visitedAddresses := map[string]int{}
5555

5656
// If there is more than 1 active generation we cannot hand out any information about excluded processes based on
5757
// the cluster status information as only the latest log processes will have the log process role. If we don't check
@@ -67,54 +67,63 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
6767
}
6868
}
6969

70+
addressesToVerify := map[string]fdbv1beta2.None{}
7071
for _, addr := range addresses {
71-
notExcludedAddresses[addr.MachineAddress()] = fdbv1beta2.None{}
72+
addressesToVerify[addr.MachineAddress()] = fdbv1beta2.None{}
7273
}
7374

7475
// Check in the status output which processes are already marked for exclusion in the cluster
7576
for _, process := range status.Cluster.Processes {
76-
if _, ok := notExcludedAddresses[process.Address.MachineAddress()]; !ok {
77+
if _, ok := addressesToVerify[process.Address.MachineAddress()]; !ok {
7778
continue
7879
}
7980

80-
visitedAddresses[process.Address.MachineAddress()] = fdbv1beta2.None{}
81+
visitedAddresses[process.Address.MachineAddress()]++
8182
if !process.Excluded {
83+
notExcludedAddresses[process.Address.MachineAddress()] = fdbv1beta2.None{}
8284
continue
8385
}
8486

8587
if len(process.Roles) == 0 {
86-
fullyExcludedAddresses[process.Address.MachineAddress()] = fdbv1beta2.None{}
88+
fullyExcludedAddresses[process.Address.MachineAddress()]++
8789
}
88-
89-
delete(notExcludedAddresses, process.Address.MachineAddress())
9090
}
9191

9292
exclusions := exclusionStatus{
93-
inProgress: make([]fdbv1beta2.ProcessAddress, 0, len(addresses)-len(notExcludedAddresses)-len(fullyExcludedAddresses)),
93+
inProgress: make([]fdbv1beta2.ProcessAddress, 0, len(addresses)),
9494
fullyExcluded: make([]fdbv1beta2.ProcessAddress, 0, len(fullyExcludedAddresses)),
9595
notExcluded: make([]fdbv1beta2.ProcessAddress, 0, len(notExcludedAddresses)),
96-
missingInStatus: make([]fdbv1beta2.ProcessAddress, 0, len(notExcludedAddresses)),
96+
missingInStatus: make([]fdbv1beta2.ProcessAddress, 0, len(addresses)-len(visitedAddresses)),
9797
}
9898

9999
for _, addr := range addresses {
100+
machine := addr.MachineAddress()
100101
// If we didn't visit that address (absent in the cluster status) we assume it's safe to run the exclude command against it.
101102
// We have to run the exclude command against those addresses, to make sure they are not serving any roles.
102-
if _, ok := visitedAddresses[addr.MachineAddress()]; !ok {
103+
visitedCount, visited := visitedAddresses[machine]
104+
if !visited {
103105
exclusions.missingInStatus = append(exclusions.missingInStatus, addr)
104106
continue
105107
}
106108

107109
// Those addresses are not excluded, so it's not safe to start the exclude command to check if they are fully excluded.
108-
if _, ok := notExcludedAddresses[addr.MachineAddress()]; ok {
110+
if _, ok := notExcludedAddresses[machine]; ok {
109111
exclusions.notExcluded = append(exclusions.notExcluded, addr)
110112
continue
111113
}
112114

113115
// Those are the processes that are marked as excluded and are not serving any roles. It's safe to delete Pods
114116
// that host those processes.
115-
if _, ok := fullyExcludedAddresses[addr.MachineAddress()]; ok {
116-
exclusions.fullyExcluded = append(exclusions.fullyExcluded, addr)
117-
continue
117+
excludedCount, ok := fullyExcludedAddresses[addr.MachineAddress()]
118+
if ok {
119+
// We have to make sure that we have visited as many processes as we have seen fully excluded. Otherwise we might
120+
// return a wrong signal if more than one process is used per Pod. In this case we have to wait for all processes
121+
// to be fully excluded.
122+
if visitedCount == excludedCount {
123+
exclusions.fullyExcluded = append(exclusions.fullyExcluded, addr)
124+
continue
125+
}
126+
logger.Info("found excluded addresses for machine, but not all processes are fully excluded", "visitedCount", visitedCount, "excludedCount", excludedCount, "machine", machine)
118127
}
119128

120129
// Those are the processes that are marked as excluded but still serve at least one role.

pkg/fdbstatus/status_checks_test.go

Lines changed: 118 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,16 @@ var _ = Describe("status_checks", func() {
6464
}
6565

6666
DescribeTable("fetching the excluded and remaining processes from the status",
67-
func(status *fdbv1beta2.FoundationDBStatus, addresses []fdbv1beta2.ProcessAddress, expectedExcluded []fdbv1beta2.ProcessAddress, expectedRemaining []fdbv1beta2.ProcessAddress, expectedFullyExcluded []fdbv1beta2.ProcessAddress, expectedMissing []fdbv1beta2.ProcessAddress) {
67+
func(status *fdbv1beta2.FoundationDBStatus,
68+
addresses []fdbv1beta2.ProcessAddress,
69+
expectedInProgress []fdbv1beta2.ProcessAddress,
70+
expectedNotExcluded []fdbv1beta2.ProcessAddress,
71+
expectedFullyExcluded []fdbv1beta2.ProcessAddress,
72+
expectedMissing []fdbv1beta2.ProcessAddress) {
73+
6874
exclusions := getRemainingAndExcludedFromStatus(logr.Discard(), status, addresses)
69-
Expect(expectedExcluded).To(ConsistOf(exclusions.inProgress))
70-
Expect(expectedRemaining).To(ConsistOf(exclusions.notExcluded))
75+
Expect(expectedInProgress).To(ConsistOf(exclusions.inProgress))
76+
Expect(expectedNotExcluded).To(ConsistOf(exclusions.notExcluded))
7177
Expect(expectedFullyExcluded).To(ConsistOf(exclusions.fullyExcluded))
7278
Expect(expectedMissing).To(ConsistOf(exclusions.missingInStatus))
7379
},
@@ -146,6 +152,115 @@ var _ = Describe("status_checks", func() {
146152
nil,
147153
nil,
148154
),
155+
Entry("when the process group has multiple processes and only one is fully excluded",
156+
&fdbv1beta2.FoundationDBStatus{
157+
Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{
158+
Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{
159+
"1": {
160+
Address: addr1,
161+
Excluded: true,
162+
},
163+
"2": {
164+
Address: addr2,
165+
},
166+
"3": {
167+
Address: addr3,
168+
},
169+
"4-1": {
170+
Address: addr4,
171+
Excluded: true,
172+
Locality: map[string]string{
173+
fdbv1beta2.FDBLocalityProcessIDKey: "4-1",
174+
},
175+
},
176+
"4-2": {
177+
Address: addr4,
178+
Excluded: true,
179+
Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{
180+
{
181+
Role: string(fdbv1beta2.ProcessRoleStorage),
182+
},
183+
},
184+
Locality: map[string]string{
185+
fdbv1beta2.FDBLocalityProcessIDKey: "4-2",
186+
},
187+
},
188+
},
189+
},
190+
},
191+
[]fdbv1beta2.ProcessAddress{addr4},
192+
[]fdbv1beta2.ProcessAddress{addr4},
193+
nil,
194+
nil,
195+
nil,
196+
),
197+
Entry("when the process group has multiple processes and both are fully excluded",
198+
&fdbv1beta2.FoundationDBStatus{
199+
Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{
200+
Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{
201+
"1": {
202+
Address: addr1,
203+
Excluded: true,
204+
},
205+
"2": {
206+
Address: addr2,
207+
},
208+
"3": {
209+
Address: addr3,
210+
},
211+
"4-1": {
212+
Address: addr4,
213+
Excluded: true,
214+
},
215+
"4-2": {
216+
Address: addr4,
217+
Excluded: true,
218+
},
219+
},
220+
},
221+
},
222+
[]fdbv1beta2.ProcessAddress{addr4},
223+
nil,
224+
nil,
225+
[]fdbv1beta2.ProcessAddress{addr4},
226+
nil,
227+
),
228+
Entry("when the process group has multiple processes and only one is excluded",
229+
&fdbv1beta2.FoundationDBStatus{
230+
Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{
231+
Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{
232+
"1": {
233+
Address: addr1,
234+
Excluded: true,
235+
},
236+
"2": {
237+
Address: addr2,
238+
},
239+
"3": {
240+
Address: addr3,
241+
},
242+
"4-1": {
243+
Address: addr4,
244+
Excluded: true,
245+
},
246+
"4-2": {
247+
Address: addr4,
248+
Excluded: false,
249+
Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{
250+
{
251+
Role: string(fdbv1beta2.ProcessRoleStorage),
252+
},
253+
},
254+
},
255+
},
256+
},
257+
},
258+
[]fdbv1beta2.ProcessAddress{addr4},
259+
nil,
260+
[]fdbv1beta2.ProcessAddress{addr4},
261+
nil,
262+
nil,
263+
),
149264
)
150265
})
151266

0 commit comments

Comments
 (0)