Skip to content

Commit a1e5ba0

Browse files
authored
Verify the processes that are excluded (#1752)
* Verify the processes that are assumed to be excluded by the machine-readable status are also excluded based on the exclude command
1 parent d28bd2b commit a1e5ba0

File tree

3 files changed

+75
-24
lines changed

3 files changed

+75
-24
lines changed

fdbclient/admin_client_test.go

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -902,7 +902,7 @@ protocol fdb00b071010000`,
902902
}
903903

904904
mockRunner = &mockCommandRunner{
905-
mockedError: fdbv1beta2.TimeoutError{Err: fmt.Errorf("timed out")},
905+
mockedError: nil,
906906
mockedOutput: "",
907907
}
908908
})
@@ -926,8 +926,39 @@ protocol fdb00b071010000`,
926926
Expect(err).NotTo(HaveOccurred())
927927
})
928928

929-
It("should not issue an exclude command", func() {
930-
Expect(mockRunner.receivedBinary).To(BeEmpty())
929+
It("should issue an exclude command to verify the exclusion", func() {
930+
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
931+
Expect(mockRunner.receivedArgs).To(ContainElements("exclude 192.168.0.1:4500 192.168.0.2:4500"))
932+
})
933+
})
934+
935+
When("all provided processes are fully excluded in the status but the exclude command returns an error", func() {
936+
BeforeEach(func() {
937+
addressesToCheck = []fdbv1beta2.ProcessAddress{
938+
{
939+
IPAddress: net.ParseIP("192.168.0.1"),
940+
Port: 4500,
941+
},
942+
{
943+
IPAddress: net.ParseIP("192.168.0.2"),
944+
Port: 4500,
945+
},
946+
}
947+
948+
mockRunner = &mockCommandRunner{
949+
mockedError: fdbv1beta2.TimeoutError{Err: fmt.Errorf("timed out")},
950+
mockedOutput: "",
951+
}
952+
})
953+
954+
It("should return an empty list and an error", func() {
955+
Expect(result).To(HaveLen(0))
956+
Expect(err).To(HaveOccurred())
957+
})
958+
959+
It("should issue an exclude command to verify the exclusion", func() {
960+
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
961+
Expect(mockRunner.receivedArgs).To(ContainElements("exclude 192.168.0.1:4500 192.168.0.2:4500"))
931962
})
932963
})
933964

@@ -947,6 +978,11 @@ protocol fdb00b071010000`,
947978
Port: 4500,
948979
},
949980
}
981+
982+
mockRunner = &mockCommandRunner{
983+
mockedError: nil,
984+
mockedOutput: "",
985+
}
950986
})
951987

952988
It("should return the one process that is not excluded", func() {
@@ -957,8 +993,8 @@ protocol fdb00b071010000`,
957993
Expect(err).NotTo(HaveOccurred())
958994
})
959995

960-
It("should not issue an exclude command", func() {
961-
Expect(mockRunner.receivedBinary).To(BeEmpty())
996+
It("should issue an exclude command to verify the exclusion", func() {
997+
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
962998
})
963999
})
9641000

@@ -988,8 +1024,8 @@ protocol fdb00b071010000`,
9881024
Expect(err).NotTo(HaveOccurred())
9891025
})
9901026

991-
It("should not issue an exclude command", func() {
992-
Expect(mockRunner.receivedBinary).To(BeEmpty())
1027+
It("should issue an exclude command to verify the exclusion", func() {
1028+
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
9931029
})
9941030
})
9951031

@@ -1028,12 +1064,12 @@ protocol fdb00b071010000`,
10281064
Expect(err).NotTo(HaveOccurred())
10291065
})
10301066

1031-
It("should not issue an exclude command", func() {
1032-
Expect(mockRunner.receivedBinary).To(BeEmpty())
1067+
It("should issue an exclude command to verify the exclusion", func() {
1068+
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
10331069
})
10341070
})
10351071

1036-
When("one process is missing in the cluster status", func() {
1072+
When("one process is missing in the cluster status and the exclude command times out", func() {
10371073
BeforeEach(func() {
10381074
addressesToCheck = []fdbv1beta2.ProcessAddress{
10391075
{
@@ -1049,6 +1085,11 @@ protocol fdb00b071010000`,
10491085
Port: 4500,
10501086
},
10511087
}
1088+
1089+
mockRunner = &mockCommandRunner{
1090+
mockedError: fdbv1beta2.TimeoutError{Err: fmt.Errorf("timed out")},
1091+
mockedOutput: "",
1092+
}
10521093
})
10531094

10541095
It("should return an empty list and no error", func() {

internal/removals/remove.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ func GetProcessGroupsToRemove(removalMode fdbv1beta2.PodUpdateMode, removals map
8484
// If the process group has not an associated process in the cluster status the zone will be UnknownZone.
8585
// if the process group has the ResourcesTerminating condition the zone will be TerminatingZone.
8686
func GetZonedRemovals(status *fdbv1beta2.FoundationDBStatus, processGroupsToRemove []*fdbv1beta2.ProcessGroupStatus) (map[string][]fdbv1beta2.ProcessGroupID, int64, error) {
87-
var lastestRemovalTimestamp int64
87+
var latestRemovalTimestamp int64
8888
// Convert the process list into a map with the process group ID as key.
8989
processInfo := map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{}
9090
for _, p := range status.Cluster.Processes {
@@ -98,8 +98,8 @@ func GetZonedRemovals(status *fdbv1beta2.FoundationDBStatus, processGroupsToRemo
9898
// that state.
9999
removalTimestamp := pointer.Int64Deref(pg.GetConditionTime(fdbv1beta2.ResourcesTerminating), 0)
100100
if removalTimestamp > 0 {
101-
if removalTimestamp > lastestRemovalTimestamp {
102-
lastestRemovalTimestamp = removalTimestamp
101+
if removalTimestamp > latestRemovalTimestamp {
102+
latestRemovalTimestamp = removalTimestamp
103103
}
104104
zoneMap[TerminatingZone] = append(zoneMap[TerminatingZone], pg.ProcessGroupID)
105105
continue
@@ -115,12 +115,11 @@ func GetZonedRemovals(status *fdbv1beta2.FoundationDBStatus, processGroupsToRemo
115115
zoneMap[zone] = append(zoneMap[zone], pg.ProcessGroupID)
116116
}
117117

118-
return zoneMap, lastestRemovalTimestamp, nil
118+
return zoneMap, latestRemovalTimestamp, nil
119119
}
120120

121121
// GetRemainingMap returns a map that indicates if a process group is fully excluded in the cluster.
122122
func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) (map[string]bool, error) {
123-
var err error
124123
addresses := make([]fdbv1beta2.ProcessAddress, 0, len(cluster.Status.ProcessGroups))
125124
for _, processGroup := range cluster.Status.ProcessGroups {
126125
if !processGroup.IsMarkedForRemoval() || processGroup.IsExcluded() {
@@ -141,19 +140,20 @@ func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient,
141140
}
142141
}
143142

144-
var remaining []fdbv1beta2.ProcessAddress
145-
if len(addresses) > 0 {
146-
remaining, err = fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status)
147-
if err != nil {
148-
return map[string]bool{}, err
149-
}
143+
remainingMap := map[string]bool{}
144+
if len(addresses) == 0 {
145+
return remainingMap, nil
146+
}
147+
148+
remaining, err := fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status)
149+
if err != nil {
150+
return nil, err
150151
}
151152

152153
if len(remaining) > 0 {
153154
logger.Info("Exclusions to complete", "remainingServers", remaining)
154155
}
155156

156-
remainingMap := make(map[string]bool, len(remaining))
157157
for _, address := range addresses {
158158
remainingMap[address.String()] = false
159159
}

pkg/fdbstatus/status_checks.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
124124
}
125125

126126
if len(process.Roles) == 0 {
127+
logger.Info("found fully excluded process without any roles", "process", process)
127128
fullyExcludedAddresses[process.Address.MachineAddress()]++
128129
}
129130
}
@@ -216,12 +217,21 @@ func CanSafelyRemoveFromStatus(logger logr.Logger, client fdbadminclient.AdminCl
216217
}
217218
}
218219

220+
// Verify that all processes that are assumed to be fully excluded based on the machine-readable status are actually
221+
// not serving any roles by running the exclude command again. If those processes are actually fully excluded and are not
222+
// serving any roles, the exclude command should terminate quickly, otherwise we will hit a timeout, and we know that
223+
// not all processes are fully excluded. This is meant to be an additional safeguard if the machine-readable status
224+
// returns the wrong signals.
225+
if len(exclusions.fullyExcluded) > 0 {
226+
// When we hit a timeout error here we know that at least one of the fullyExcluded is still not fully excluded.
227+
return notSafeToDelete, client.ExcludeProcesses(exclusions.fullyExcluded)
228+
}
229+
219230
// All processes that are either not yet marked as excluded or still serving at least one role, cannot be removed safely.
220231
return notSafeToDelete, nil
221232
}
222233

223-
// GetExclusions gets a list of the addresses currently excluded from the
224-
// database, based on the provided status.
234+
// GetExclusions gets a list of the addresses currently excluded from the database, based on the provided status.
225235
func GetExclusions(status *fdbv1beta2.FoundationDBStatus) ([]fdbv1beta2.ProcessAddress, error) {
226236
excludedServers := status.Cluster.DatabaseConfiguration.ExcludedServers
227237
exclusions := make([]fdbv1beta2.ProcessAddress, 0, len(excludedServers))

0 commit comments

Comments
 (0)