Commit 9d6bb7d

Add additional safety checks for bouncing and excluding processes (#1924)
1 parent fa1d181 commit 9d6bb7d

File tree: 7 files changed, +419 −60 lines


api/v1beta2/foundationdb_status.go

Lines changed: 8 additions & 0 deletions
```diff
@@ -129,6 +129,14 @@ type FoundationDBStatusClusterInfo struct {
 
 	// Messages represents the possible messages that are part of the cluster information.
 	Messages []FoundationDBStatusMessage `json:"messages,omitempty"`
+
+	// BounceImpact represents the bounce_impact part of the machine-readable status.
+	BounceImpact FoundationDBBounceImpact `json:"bounce_impact,omitempty"`
+}
+
+// FoundationDBBounceImpact represents the bounce_impact part of the machine-readable status.
+type FoundationDBBounceImpact struct {
+	CanCleanBounce *bool `json:"can_clean_bounce,omitempty"`
 }
 
 // FaultTolerance provides information about the fault tolerance status
```
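
For context, a minimal, self-contained sketch of how the new `bounce_impact` section round-trips through `encoding/json`. The struct types below are local stand-ins for the real `api/v1beta2` types added above, not the operator's own code.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local mirror of the fields added in this commit; the real definitions live in api/v1beta2.
type bounceImpact struct {
	CanCleanBounce *bool `json:"can_clean_bounce,omitempty"`
}

type clusterInfo struct {
	BounceImpact bounceImpact `json:"bounce_impact,omitempty"`
}

func main() {
	// A fragment of the machine-readable status as reported by `status json`.
	raw := []byte(`{"bounce_impact":{"can_clean_bounce":false}}`)

	var info clusterInfo
	if err := json.Unmarshal(raw, &info); err != nil {
		panic(err)
	}

	// The pointer distinguishes "reported as false" from "not reported at all".
	if info.BounceImpact.CanCleanBounce != nil {
		fmt.Println("can_clean_bounce:", *info.BounceImpact.CanCleanBounce)
	} else {
		fmt.Println("can_clean_bounce not present in status")
	}
}
```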

api/v1beta2/foundationdb_status_test.go

Lines changed: 4 additions & 0 deletions
```diff
@@ -22,6 +22,7 @@ package v1beta2
 
 import (
 	"encoding/json"
+	"k8s.io/utils/pointer"
 	"net"
 	"os"
 	"path/filepath"
@@ -906,6 +907,9 @@ var _ = Describe("FoundationDBStatus", func() {
 					SecondsSinceLastRecovered: 76.8155,
 				},
 				Generation: 2,
+				BounceImpact: FoundationDBBounceImpact{
+					CanCleanBounce: pointer.Bool(true),
+				},
 			}
 
 			It("should parse all values correctly", func() {
```

api/v1beta2/zz_generated.deepcopy.go

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default.

controllers/bounce_processes.go

Lines changed: 12 additions & 6 deletions
```diff
@@ -78,19 +78,25 @@ func (bounceProcesses) reconcile(ctx context.Context, r *FoundationDBClusterReco
 
 	logger.V(1).Info("processes that can be restarted", "addresses", addresses)
 
-	if minimumUptime < float64(cluster.GetMinimumUptimeSecondsForBounce()) {
-		r.Recorder.Event(cluster, corev1.EventTypeNormal, "NeedsBounce",
-			fmt.Sprintf("Spec require a bounce of some processes, but the cluster has only been up for %f seconds", minimumUptime))
+	// Check if the cluster can safely bounce processes.
+	err = fdbstatus.CanSafelyBounceProcesses(minimumUptime, float64(cluster.GetMinimumUptimeSecondsForBounce()), status)
+	if err != nil {
+		r.Recorder.Event(cluster, corev1.EventTypeNormal, "NeedsBounce", err.Error())
 		cluster.Status.Generations.NeedsBounce = cluster.ObjectMeta.Generation
 		err = r.updateOrApply(ctx, cluster)
 		if err != nil {
 			logger.Error(err, "Error updating cluster status")
 		}
 
-		// Retry after we waited the minimum uptime
+		// Retry after we waited the minimum uptime or at least 15 seconds.
+		delayTime := cluster.GetMinimumUptimeSecondsForBounce() - int(minimumUptime)
+		if delayTime < 15 {
+			delayTime = 15
+		}
+
 		return &requeue{
-			message: "Cluster needs to stabilize before bouncing",
-			delay: time.Second * time.Duration(cluster.GetMinimumUptimeSecondsForBounce()-int(minimumUptime)),
+			message: err.Error(),
+			delay: time.Second * time.Duration(delayTime),
 		}
 	}
 
```
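
The requeue delay now waits out the remaining minimum uptime but is clamped to at least 15 seconds. A stand-alone sketch of that calculation (the helper name and sample values are illustrative, not part of the operator):

```go
package main

import (
	"fmt"
	"time"
)

// requeueDelay mirrors the delay logic in the diff above: wait until the configured
// minimum uptime has passed, but never requeue more often than every 15 seconds.
func requeueDelay(minimumUptimeSecondsForBounce int, currentUptime float64) time.Duration {
	delayTime := minimumUptimeSecondsForBounce - int(currentUptime)
	if delayTime < 15 {
		delayTime = 15
	}
	return time.Second * time.Duration(delayTime)
}

func main() {
	fmt.Println(requeueDelay(600, 30))  // 9m30s: wait out the remaining uptime
	fmt.Println(requeueDelay(600, 595)) // 15s: clamped to the minimum requeue interval
}
```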

controllers/exclude_processes.go

Lines changed: 46 additions & 38 deletions
```diff
@@ -66,57 +66,65 @@ func (e excludeProcesses) reconcile(_ context.Context, r *FoundationDBClusterRec
 	logger.Info("current exclusions", "exclusions", exclusions)
 	fdbProcessesToExcludeByClass, ongoingExclusionsByClass := getProcessesToExclude(exclusions, cluster)
 
-	if len(fdbProcessesToExcludeByClass) > 0 {
-		var fdbProcessesToExclude []fdbv1beta2.ProcessAddress
-		desiredProcesses, err := cluster.GetProcessCountsWithDefaults()
-		if err != nil {
-			return &requeue{curError: err, delayedRequeue: true}
-		}
-
-		desiredProcessesMap := desiredProcesses.Map()
-		for processClass := range fdbProcessesToExcludeByClass {
-			contextLogger := logger.WithValues("processClass", processClass)
-			ongoingExclusions := ongoingExclusionsByClass[processClass]
-			processesToExclude := fdbProcessesToExcludeByClass[processClass]
+	// No processes have to be excluded we can directly return.
+	if len(fdbProcessesToExcludeByClass) == 0 {
+		return nil
+	}
 
-			allowedExclusions, missingProcesses := getAllowedExclusionsAndMissingProcesses(contextLogger, cluster, processClass, desiredProcessesMap[processClass], ongoingExclusions, r.InSimulation)
-			if allowedExclusions <= 0 {
-				contextLogger.Info("Waiting for missing processes before continuing with the exclusion", "missingProcesses", missingProcesses, "addressesToExclude", processesToExclude, "allowedExclusions", allowedExclusions, "ongoingExclusions", ongoingExclusions)
-				continue
-			}
+	// Make sure it's safe to exclude processes.
+	err = fdbstatus.CanSafelyExcludeProcesses(status)
+	if err != nil {
+		return &requeue{curError: err, delayedRequeue: true}
+	}
 
-			// If we are not able to exclude all processes at once print a log message.
-			if len(processesToExclude) > allowedExclusions {
-				contextLogger.Info("Some processes are still missing but continuing with the exclusion", "missingProcesses", missingProcesses, "addressesToExclude", processesToExclude, "allowedExclusions", allowedExclusions, "ongoingExclusions", ongoingExclusions)
-			}
+	var fdbProcessesToExclude []fdbv1beta2.ProcessAddress
+	desiredProcesses, err := cluster.GetProcessCountsWithDefaults()
+	if err != nil {
+		return &requeue{curError: err, delayedRequeue: true}
+	}
 
-			if len(processesToExclude) < allowedExclusions {
-				allowedExclusions = len(processesToExclude)
-			}
+	desiredProcessesMap := desiredProcesses.Map()
+	for processClass := range fdbProcessesToExcludeByClass {
+		contextLogger := logger.WithValues("processClass", processClass)
+		ongoingExclusions := ongoingExclusionsByClass[processClass]
+		processesToExclude := fdbProcessesToExcludeByClass[processClass]
 
-			// TODO: As a next step we could exclude transaction (log + stateless) processes together and exclude
-			// storage processes with a separate call. This would make sure that no storage checks will block
-			// the exclusion of transaction processes.
+		allowedExclusions, missingProcesses := getAllowedExclusionsAndMissingProcesses(contextLogger, cluster, processClass, desiredProcessesMap[processClass], ongoingExclusions, r.InSimulation)
+		if allowedExclusions <= 0 {
+			contextLogger.Info("Waiting for missing processes before continuing with the exclusion", "missingProcesses", missingProcesses, "addressesToExclude", processesToExclude, "allowedExclusions", allowedExclusions, "ongoingExclusions", ongoingExclusions)
+			continue
+		}
 
-			// Add as many processes as allowed to the exclusion list.
-			fdbProcessesToExclude = append(fdbProcessesToExclude, processesToExclude[:allowedExclusions]...)
+		// If we are not able to exclude all processes at once print a log message.
+		if len(processesToExclude) > allowedExclusions {
+			contextLogger.Info("Some processes are still missing but continuing with the exclusion", "missingProcesses", missingProcesses, "addressesToExclude", processesToExclude, "allowedExclusions", allowedExclusions, "ongoingExclusions", ongoingExclusions)
 		}
 
-		if len(fdbProcessesToExclude) == 0 {
-			return &requeue{
-				message: "more exclusions needed but not allowed, have to wait for new processes to come up",
-				delayedRequeue: true,
-			}
+		if len(processesToExclude) < allowedExclusions {
+			allowedExclusions = len(processesToExclude)
 		}
 
-		r.Recorder.Event(cluster, corev1.EventTypeNormal, "ExcludingProcesses", fmt.Sprintf("Excluding %v", fdbProcessesToExclude))
+		// TODO: As a next step we could exclude transaction (log + stateless) processes together and exclude
+		// storage processes with a separate call. This would make sure that no storage checks will block
+		// the exclusion of transaction processes.
 
-		err = adminClient.ExcludeProcesses(fdbProcessesToExclude)
-		if err != nil {
-			return &requeue{curError: err, delayedRequeue: true}
+		// Add as many processes as allowed to the exclusion list.
+		fdbProcessesToExclude = append(fdbProcessesToExclude, processesToExclude[:allowedExclusions]...)
+	}
+
+	if len(fdbProcessesToExclude) == 0 {
+		return &requeue{
+			message: "more exclusions needed but not allowed, have to wait for new processes to come up",
+			delayedRequeue: true,
 		}
 	}
 
+	r.Recorder.Event(cluster, corev1.EventTypeNormal, "ExcludingProcesses", fmt.Sprintf("Excluding %v", fdbProcessesToExclude))
+
+	err = adminClient.ExcludeProcesses(fdbProcessesToExclude)
+	if err != nil {
+		return &requeue{curError: err, delayedRequeue: true}
+	}
+
 	return nil
 }
```
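
Beyond the new safety gate, the loop still limits how many processes are excluded per reconciliation via `allowedExclusions`. A self-contained sketch of that batching rule, using plain strings in place of `fdbv1beta2.ProcessAddress` values (names and sample data are illustrative):

```go
package main

import "fmt"

// selectExclusions sketches the capping logic from the loop above: exclude at most
// allowedExclusions processes per class and leave the rest for a later reconciliation.
func selectExclusions(processesToExclude []string, allowedExclusions int) []string {
	if allowedExclusions <= 0 {
		// Wait for missing processes before excluding anything from this class.
		return nil
	}
	if len(processesToExclude) < allowedExclusions {
		allowedExclusions = len(processesToExclude)
	}
	return processesToExclude[:allowedExclusions]
}

func main() {
	candidates := []string{"storage-1", "storage-2", "storage-3"}
	fmt.Println(selectExclusions(candidates, 2)) // [storage-1 storage-2]
	fmt.Println(selectExclusions(candidates, 5)) // all three candidates
	fmt.Println(selectExclusions(candidates, 0)) // [] (nothing excluded yet)
}
```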

pkg/fdbstatus/status_checks.go

Lines changed: 51 additions & 16 deletions
```diff
@@ -71,22 +71,9 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
 	// the cluster status information as only the latest log processes will have the log process role. If we don't check
 	// for the active generations we have the risk to remove a log process that still has mutations on it that must be
 	// popped.
-	if status.Cluster.RecoveryState.ActiveGenerations > 1 {
-		logger.Info("Skipping exclusion check as there are multiple active generations", "activeGenerations", status.Cluster.RecoveryState.ActiveGenerations)
-		return exclusionStatus{
-			inProgress: nil,
-			fullyExcluded: nil,
-			notExcluded: addresses,
-			missingInStatus: nil,
-		}
-	}
-
-	// If the database is unavailable the status might contain the processes but with missing role information. In this
-	// case we should not perform any validation on the machine-readable status as this information might not be correct.
-	// We don't want to run the exclude command to check for the status of the exclusions as this might result in a recovery
-	// of the cluster or brings the cluster into a worse state.
-	if !status.Client.DatabaseStatus.Available {
-		logger.Info("Skipping exclusion check as the database is unavailable")
+	err := DefaultSafetyChecks(status, 1, "check exclusion status")
+	if err != nil {
+		logger.Info("Skipping exclusion check as there are issues with the machine-readable status", "error", err.Error())
 		return exclusionStatus{
 			inProgress: nil,
 			fullyExcluded: nil,
@@ -459,3 +446,51 @@ func HasDesiredFaultToleranceFromStatus(log logr.Logger, status *fdbv1beta2.Foun
 
 	return true
 }
+
+// DefaultSafetyChecks performs a set of default safety checks, e.g. it checks if the cluster is available from the
+// client perspective and it checks that there are not too many active generations.
+func DefaultSafetyChecks(status *fdbv1beta2.FoundationDBStatus, maximumActiveGenerations int, action string) error {
+	// If there are more than 10 active generations we should not allow the cluster to bounce processes as this could
+	// cause another recovery increasing the active generations. In general the active generations should be at 1 during
+	// normal operations.
+	if status.Cluster.RecoveryState.ActiveGenerations > maximumActiveGenerations {
+		return fmt.Errorf("cluster has %d active generations, but only %d active generations are allowed to safely %s", status.Cluster.RecoveryState.ActiveGenerations, maximumActiveGenerations, action)
+	}
+
+	// If the database is unavailable we shouldn't perform any action on the cluster.
+	if !status.Client.DatabaseStatus.Available {
+		return fmt.Errorf("cluster is unavailable, cannot %s", action)
+	}
+
+	return nil
+}
+
+// CanSafelyBounceProcesses returns nil when it is safe to do a bounce on the cluster or returns an error with more information
+// why it's not safe to bounce processes in the cluster.
+func CanSafelyBounceProcesses(currentUptime float64, minimumUptime float64, status *fdbv1beta2.FoundationDBStatus) error {
+	err := DefaultSafetyChecks(status, 10, "bounce processes")
+	if err != nil {
+		return err
+	}
+
+	// If the current uptime of the cluster is below the minimum uptime we should not allow to bounce processes. This is
+	// a safeguard to reduce the risk of repeated bounces in a short timeframe.
+	if currentUptime < minimumUptime {
+		return fmt.Errorf("cluster has only been up for %.2f seconds, but must be up for %.2f seconds to safely bounce", minimumUptime, minimumUptime)
+	}
+
+	// If the machine-readable status reports that a clean bounce is not possible, we shouldn't perform a bounce. This
+	// value will be false if the cluster is not fully recovered:
+	// https://github.com/apple/foundationdb/blob/7.3.29/fdbserver/Status.actor.cpp#L437-L448
+	if status.Cluster.BounceImpact.CanCleanBounce != nil && !*status.Cluster.BounceImpact.CanCleanBounce {
+		return fmt.Errorf("cannot perform a clean bounce based on cluster status, current recovery state: %s", status.Cluster.RecoveryState.Name)
+	}
+
+	return nil
+}
+
+// CanSafelyExcludeProcesses currently performs the DefaultSafetyChecks. In the future this check might be extended to
+// perform more specific checks.
+func CanSafelyExcludeProcesses(status *fdbv1beta2.FoundationDBStatus) error {
+	return DefaultSafetyChecks(status, 10, "exclude processes")
+}
```
