Skip to content

Commit f80c420

Browse files
authored
Try to batch the exclusions of transaction system processes together (#2158)
1 parent bf4bd1b commit f80c420

File tree

2 files changed

+112
-1
lines changed

2 files changed

+112
-1
lines changed

controllers/exclude_processes.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,25 @@ func (e excludeProcesses) reconcile(ctx context.Context, r *FoundationDBClusterR
113113
return &requeue{curError: err, delayedRequeue: true}
114114
}
115115

116+
// transactionSystemExclusionAllowed will keep track if the exclusion is allowed and if the operator is allowed to
117+
// exclude processes from the transaction system. If multiple processes from different processes classes that are part
118+
// of the transaction system should be excluded, the operator will expect that the exclusion is allowed for all
119+
// transaction system processes. The idea here is to reduce the number of recoveries during transaction system
120+
// migrations as the stateless pods are often created much faster than the log pod as the stateless pods don't have
121+
// to wait for the storage provisioning.
122+
transactionSystemExclusionAllowed := true
116123
desiredProcessesMap := desiredProcesses.Map()
117124
for processClass := range fdbProcessesToExcludeByClass {
118125
contextLogger := logger.WithValues("processClass", processClass)
119126
ongoingExclusions := ongoingExclusionsByClass[processClass]
120127
processesToExclude := fdbProcessesToExcludeByClass[processClass]
121128

122129
allowedExclusions, missingProcesses := getAllowedExclusionsAndMissingProcesses(contextLogger, cluster, processClass, desiredProcessesMap[processClass], ongoingExclusions, r.InSimulation)
130+
// TODO (johscheuer): Should we also batch exclusions for storage servers? Those should be rare compared to replacements in the transaction system.
123131
if allowedExclusions <= 0 {
132+
if processClass.IsTransaction() {
133+
transactionSystemExclusionAllowed = false
134+
}
124135
contextLogger.Info("Waiting for missing processes before continuing with the exclusion", "missingProcesses", missingProcesses, "addressesToExclude", processesToExclude, "allowedExclusions", allowedExclusions, "ongoingExclusions", ongoingExclusions)
125136
continue
126137
}
@@ -134,7 +145,7 @@ func (e excludeProcesses) reconcile(ctx context.Context, r *FoundationDBClusterR
134145
allowedExclusions = len(processesToExclude)
135146
}
136147

137-
// TODO: As a next step we could exclude transaction (log + stateless) processes together and exclude
148+
// TODO (johscheuer): As a next step we could exclude transaction (log + stateless) processes together and exclude
138149
// storage processes with a separate call. This would make sure that no storage checks will block
139150
// the exclusion of transaction processes.
140151

@@ -149,6 +160,15 @@ func (e excludeProcesses) reconcile(ctx context.Context, r *FoundationDBClusterR
149160
}
150161
}
151162

163+
// In case that there are processes from different transaction process classes, we expect that the operator is allowed
164+
// to exclude processes from all the different process classes. If not the operator will delay the exclusion.
165+
if !transactionSystemExclusionAllowed {
166+
return &requeue{
167+
message: "more exclusions needed but not allowed, have to wait until new processes for the transaction system are up to reduce number of recoveries.",
168+
delayedRequeue: true,
169+
}
170+
}
171+
152172
var coordinatorExcluded bool
153173
for _, excludeProcess := range fdbProcessesToExclude {
154174
excludeString := excludeProcess.String()

controllers/exclude_processes_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,97 @@ var _ = Describe("exclude_processes", func() {
830830
})
831831
})
832832
})
833+
834+
When("transaction system processes should be excluded", func() {
835+
When("a stateless and a log process should be excluded", func() {
836+
BeforeEach(func() {
837+
var pickedStateless, pickedLog bool
838+
839+
_, processGroupIDs, err := cluster.GetCurrentProcessGroupsAndProcessCounts()
840+
Expect(err).NotTo(HaveOccurred())
841+
cluster.Status.ProcessGroups = append(cluster.Status.ProcessGroups, fdbv1beta2.NewProcessGroupStatus(cluster.GetNextRandomProcessGroupID(fdbv1beta2.ProcessClassLog, processGroupIDs[fdbv1beta2.ProcessClassLog]), fdbv1beta2.ProcessClassLog, nil))
842+
cluster.Status.ProcessGroups[len(cluster.Status.ProcessGroups)-1].ProcessGroupConditions = nil
843+
cluster.Status.ProcessGroups = append(cluster.Status.ProcessGroups, fdbv1beta2.NewProcessGroupStatus(cluster.GetNextRandomProcessGroupID(fdbv1beta2.ProcessClassStateless, processGroupIDs[fdbv1beta2.ProcessClassStateless]), fdbv1beta2.ProcessClassStateless, nil))
844+
cluster.Status.ProcessGroups[len(cluster.Status.ProcessGroups)-1].ProcessGroupConditions = nil
845+
846+
markedForRemoval := make([]fdbv1beta2.ProcessGroupID, 0, 2)
847+
for _, processGroup := range cluster.Status.ProcessGroups {
848+
if !processGroup.ProcessClass.IsTransaction() || processGroup.ProcessClass == fdbv1beta2.ProcessClassClusterController {
849+
continue
850+
}
851+
852+
if pickedLog && pickedStateless {
853+
break
854+
}
855+
856+
if !processGroup.ProcessClass.IsLogProcess() {
857+
if pickedStateless {
858+
continue
859+
}
860+
pickedStateless = true
861+
}
862+
863+
if processGroup.ProcessClass.IsLogProcess() {
864+
if pickedLog {
865+
continue
866+
}
867+
pickedLog = true
868+
}
869+
870+
processGroup.MarkForRemoval()
871+
markedForRemoval = append(markedForRemoval, processGroup.ProcessGroupID)
872+
}
873+
874+
Expect(markedForRemoval).To(HaveLen(2))
875+
})
876+
877+
When("no processes are missing", func() {
878+
It("should exclude the process", func() {
879+
adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient)
880+
Expect(err).NotTo(HaveOccurred())
881+
882+
Expect(req).To(BeNil())
883+
Expect(adminClient.ExcludedAddresses).To(HaveLen(2))
884+
})
885+
})
886+
887+
When("a transaction process is missing", func() {
888+
var missingProcssGroup *fdbv1beta2.ProcessGroupStatus
889+
890+
BeforeEach(func() {
891+
_, processGroupIDs, err := cluster.GetCurrentProcessGroupsAndProcessCounts()
892+
Expect(err).NotTo(HaveOccurred())
893+
894+
missingProcssGroup = fdbv1beta2.NewProcessGroupStatus(cluster.GetNextRandomProcessGroupID(fdbv1beta2.ProcessClassLog, processGroupIDs[fdbv1beta2.ProcessClassLog]), fdbv1beta2.ProcessClassLog, nil)
895+
cluster.Status.ProcessGroups = append(cluster.Status.ProcessGroups, missingProcssGroup)
896+
// We have to set InSimulation here to false, otherwise the MissingProcess timestamp will be ignored.
897+
clusterReconciler.InSimulation = false
898+
})
899+
900+
It("should not exclude the process", func() {
901+
adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient)
902+
Expect(err).NotTo(HaveOccurred())
903+
904+
Expect(req).NotTo(BeNil())
905+
Expect(adminClient.ExcludedAddresses).To(BeEmpty())
906+
})
907+
908+
When("the transaction process is missing for more than 10 minutes", func() {
909+
BeforeEach(func() {
910+
missingProcssGroup.ProcessGroupConditions[0].Timestamp = time.Now().Add(-10 * time.Minute).Unix()
911+
})
912+
913+
It("should exclude the process", func() {
914+
adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient)
915+
Expect(err).NotTo(HaveOccurred())
916+
917+
Expect(req).To(BeNil())
918+
Expect(adminClient.ExcludedAddresses).To(HaveLen(2))
919+
})
920+
})
921+
})
922+
})
923+
})
833924
})
834925
})
835926

0 commit comments

Comments
 (0)