Commit 14e271f

Ensure that the operator checks if the processes for newly created pods are up and running for the update pod config reconciler (#2244)
* Ensure that the operator checks if the processes for newly created pods are up and running for the update pod config reconciler
1 parent d7c2895 commit 14e271f
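
In short, the reconciler now fetches the machine-readable status before it collects pods to update, groups the reported processes by their instance-ID locality (ignoring processes from other data centers), and refuses to pick a pod for bouncing if that pod was created less than a minute ago and its processes are either absent from the status or report an uptime larger than the pod's age, which indicates the operator is looking at a stale, cached status. Below is a minimal, self-contained Go sketch of that gate; ProcessInfo and checkRecentlyCreatedPod are illustrative stand-ins rather than the operator's actual types and helpers, and the InSimulation carve-out from the real code is omitted. Run as-is, the first call in main reports the stale-status error and the second returns nil.

package main

import (
	"fmt"
	"time"
)

// ProcessInfo stands in for the process entries from the FDB machine-readable
// status (fdbv1beta2.FoundationDBStatusProcessInfo in the operator).
type ProcessInfo struct {
	UptimeSeconds float64
}

// checkRecentlyCreatedPod mirrors the idea of the new gate in getPodsToUpdate:
// a pod that is younger than one minute may only be scheduled for an update if
// its processes are reported in the status and their uptime does not exceed
// the pod's age.
func checkRecentlyCreatedPod(podName string, podCreated time.Time, processes []ProcessInfo) error {
	timeSincePodCreation := time.Since(podCreated)
	if timeSincePodCreation >= time.Minute {
		// Old enough: no freshness check needed.
		return nil
	}

	if len(processes) == 0 {
		return fmt.Errorf("%s was recently created and the processes are not yet running", podName)
	}

	for _, process := range processes {
		// An uptime larger than the pod's age means the status entry predates
		// the recreated pod, i.e. the cached status is stale.
		if process.UptimeSeconds > timeSincePodCreation.Seconds() {
			return fmt.Errorf("%s was recently created but the reported uptime (%.0fs) is older than the pod (%.0fs)",
				podName, process.UptimeSeconds, timeSincePodCreation.Seconds())
		}
	}

	return nil
}

func main() {
	created := time.Now().Add(-30 * time.Second)

	// Stale cached status: the process claims 60000s of uptime for a 30s old pod.
	fmt.Println(checkRecentlyCreatedPod("storage-1", created, []ProcessInfo{{UptimeSeconds: 60000}}))

	// Fresh status: the process restarted together with the pod.
	fmt.Println(checkRecentlyCreatedPod("storage-1", created, []ProcessInfo{{UptimeSeconds: 12}}))
}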

File tree (2 files changed: +197 -46)

controllers/update_pods.go
controllers/update_pods_test.go

controllers/update_pods.go

Lines changed: 78 additions & 35 deletions
@@ -24,6 +24,7 @@ import (
 	"context"
 	"fmt"
 	"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbadminclient"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/utils/pointer"
 	"time"

@@ -43,7 +44,23 @@ type updatePods struct{}

 // reconcile runs the reconciler's work.
 func (u updatePods) reconcile(ctx context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, logger logr.Logger) *requeue {
-	updates, err := getPodsToUpdate(ctx, logger, r, cluster)
+	adminClient, err := r.getAdminClient(logger, cluster)
+	if err != nil {
+		return &requeue{curError: err, delayedRequeue: true}
+	}
+	defer func() {
+		_ = adminClient.Close()
+	}()
+
+	// If the status is not cached, we have to fetch it.
+	if status == nil {
+		status, err = adminClient.GetStatus()
+		if err != nil {
+			return &requeue{curError: err}
+		}
+	}
+
+	updates, err := getPodsToUpdate(ctx, logger, r, cluster, getProcessesByProcessGroup(cluster, status))
 	if err != nil {
 		return &requeue{curError: err, delay: podSchedulingDelayDuration, delayedRequeue: true}
 	}
@@ -65,22 +82,6 @@ func (u updatePods) reconcile(ctx context.Context, r *FoundationDBClusterReconci
 		return nil
 	}

-	adminClient, err := r.getAdminClient(logger, cluster)
-	if err != nil {
-		return &requeue{curError: err, delayedRequeue: true}
-	}
-	defer func() {
-		_ = adminClient.Close()
-	}()
-
-	// If the status is not cached, we have to fetch it.
-	if status == nil {
-		status, err = adminClient.GetStatus()
-		if err != nil {
-			return &requeue{curError: err}
-		}
-	}
-
 	return deletePodsForUpdates(ctx, r, cluster, updates, logger, status, adminClient)
 }

@@ -131,8 +132,32 @@ func getFaultDomainsWithUnavailablePods(ctx context.Context, logger logr.Logger,
 	return faultDomainsWithUnavailablePods
 }

+func getProcessesByProcessGroup(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) map[string][]fdbv1beta2.FoundationDBStatusProcessInfo {
+	processMap := map[string][]fdbv1beta2.FoundationDBStatusProcessInfo{}
+
+	for _, process := range status.Cluster.Processes {
+		if len(process.Locality) == 0 {
+			continue
+		}
+
+		processGroupID, ok := process.Locality[fdbv1beta2.FDBLocalityInstanceIDKey]
+		if !ok {
+			continue
+		}
+
+		// Ignore all processes for the process map that are for a different data center
+		if !cluster.ProcessSharesDC(process) {
+			continue
+		}
+
+		processMap[processGroupID] = append(processMap[processGroupID], process)
+	}
+
+	return processMap
+}
+
 // getPodsToUpdate returns a map of Zone to Pods mapping. The map has the fault domain as key and all Pods in that fault domain will be present as a slice of *corev1.Pod.
-func getPodsToUpdate(ctx context.Context, logger logr.Logger, reconciler *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster) (map[string][]*corev1.Pod, error) {
+func getPodsToUpdate(ctx context.Context, logger logr.Logger, reconciler *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, processInformation map[string][]fdbv1beta2.FoundationDBStatusProcessInfo) (map[string][]*corev1.Pod, error) {
 	updates := make(map[string][]*corev1.Pod)

 	faultDomainsWithUnavailablePods := getFaultDomainsWithUnavailablePods(ctx, logger, reconciler, cluster)
@@ -182,29 +207,50 @@ func getPodsToUpdate(ctx context.Context, logger logr.Logger, reconciler *Founda
 		pod, err := reconciler.PodLifecycleManager.GetPod(ctx, reconciler, cluster, processGroup.GetPodName(cluster))
 		// If a Pod is not found ignore it for now.
 		if err != nil {
-			logger.V(1).Info("Could not find Pod for process group ID",
-				"processGroupID", processGroup.ProcessGroupID)
+			if k8serrors.IsNotFound(err) {
+				logger.V(1).Info("Could not find Pod for process group ID",
+					"processGroupID", processGroup.ProcessGroupID)
+
+				// Check when the Pod went missing. If the condition is unset the current timestamp will be used, in that case
+				// the fdbv1beta2.MissingPod duration will be smaller than the 90 seconds buffer. The 90 seconds buffer
+				// was chosen as per default the failure detection in FDB takes 60 seconds to detect a failing fdbserver
+				// process (or actually to mark it failed). Without this check there could be a race condition where the
+				// Pod is already removed, so the process group would be skipped here but the fdbserver process is not yet
+				// marked as failed in FDB, which causes FDB to return full replication in the cluster status.
+				//
+				// With the unified image there is support for delaying the shutdown to reduce this risk even further.
+				missingPodDuration := time.Since(time.Unix(pointer.Int64Deref(processGroup.GetConditionTime(fdbv1beta2.MissingPod), time.Now().Unix()), 0))
+				if missingPodDuration < 90*time.Second {
+					podMissingError = fmt.Errorf("ProcessGroup: %s is missing the associated Pod for %s will be blocking until the Pod is missing for at least 90 seconds", processGroup.ProcessGroupID, missingPodDuration.String())
+				}

-			// Check when the Pod went missing. If the condition is unset the current timestamp will be used, in that case
-			// the fdbv1beta2.MissingPod duration will be smaller than the 90 seconds buffer. The 90 seconds buffer
-			// was chosen as per default the failure detection in FDB takes 60 seconds to detect a failing fdbserver
-			// process (or actually to mark it failed). Without this check there could be a race condition where the
-			// Pod is already removed, so the process group would be skipped here but the fdbserver process is not yet
-			// marked as failed in FDB, which causes FDB to return full replication in the cluster status.
-			//
-			// With the unified image there is support for delaying the shutdown to reduce this risk even further.
-			missingPodDuration := time.Since(time.Unix(pointer.Int64Deref(processGroup.GetConditionTime(fdbv1beta2.MissingPod), time.Now().Unix()), 0))
-			if missingPodDuration < 90*time.Second {
-				podMissingError = fmt.Errorf("ProcessGroup: %s is missing the associated Pod for %s will be blocking until the Pod is missing for at least 90 seconds", processGroup.ProcessGroupID, missingPodDuration.String())
+				continue
 			}

-			continue
+			return nil, err
 		}

 		if shouldRequeueDueToTerminatingPod(pod, cluster, processGroup.ProcessGroupID) {
 			return nil, fmt.Errorf("cluster has Pod %s that is pending deletion", pod.Name)
 		}

+		// If the pod was recently created, check if the processes are already running, if not return an error.
+		timeSincePodCreation := time.Since(pod.CreationTimestamp.Time)
+		if timeSincePodCreation < 1*time.Minute {
+			processes, ok := processInformation[string(processGroup.ProcessGroupID)]
+			if len(processes) == 0 || !ok {
+				return nil, fmt.Errorf("%s was recently created and the processes are not yet running", pod.Name)
+			}
+
+			for _, process := range processes {
+				// If the uptime is higher than the time since the pod was created, that means the reported process
+				// has some stale data. This could happen in cases where the status is cached in the operator.
+				if process.UptimeSeconds > timeSincePodCreation.Seconds() && !reconciler.InSimulation {
+					return nil, fmt.Errorf("%s was recently created but the process uptime reports old uptime, time since pod was created: %f.2 seconds and process up time: %f.2", pod.Name, timeSincePodCreation.Seconds(), process.UptimeSeconds)
+				}
+			}
+		}
+
 		specHash, err := internal.GetPodSpecHash(cluster, processGroup, nil)
 		if err != nil {
 			logger.Info("Skipping Pod due to error generating spec hash",
@@ -262,9 +308,6 @@ func getPodsToUpdate(ctx context.Context, logger logr.Logger, reconciler *Founda
 			zone = "simulation"
 		}

-		if updates[zone] == nil {
-			updates[zone] = make([]*corev1.Pod, 0)
-		}
 		updates[zone] = append(updates[zone], pod)
 	}

controllers/update_pods_test.go

Lines changed: 119 additions & 11 deletions
@@ -25,6 +25,8 @@ import (
 	"fmt"
 	"time"

+	"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbadminclient/mock"
+
 	"github.com/FoundationDB/fdb-kubernetes-operator/v2/internal"
 	ctrlClient "sigs.k8s.io/controller-runtime/pkg/client"

@@ -317,7 +319,7 @@ var _ = Describe("update_pods", func() {
 	When("fetching all Pods that needs an update", func() {
 		var cluster *fdbv1beta2.FoundationDBCluster
 		var updates map[string][]*corev1.Pod
-		var err error
+		var updateErr error

 		BeforeEach(func() {
 			cluster = internal.CreateDefaultCluster()
@@ -329,13 +331,18 @@ var _ = Describe("update_pods", func() {
 		})

 		JustBeforeEach(func() {
-			updates, err = getPodsToUpdate(context.Background(), globalControllerLogger, clusterReconciler, cluster)
+			adminClient, err := mock.NewMockAdminClient(cluster, k8sClient)
+			Expect(err).NotTo(HaveOccurred())
+			status, err := adminClient.GetStatus()
+			Expect(err).NotTo(HaveOccurred())
+
+			updates, updateErr = getPodsToUpdate(context.Background(), globalControllerLogger, clusterReconciler, cluster, getProcessesByProcessGroup(cluster, status))
 		})

 		When("the cluster has no changes", func() {
 			It("should return no errors and an empty map", func() {
 				Expect(updates).To(HaveLen(0))
-				Expect(err).NotTo(HaveOccurred())
+				Expect(updateErr).NotTo(HaveOccurred())
 			})

 			When("a Pod is missing", func() {
@@ -352,7 +359,7 @@ var _ = Describe("update_pods", func() {

 				It("should return no errors and an empty map", func() {
 					Expect(updates).To(HaveLen(0))
-					Expect(err).NotTo(HaveOccurred())
+					Expect(updateErr).NotTo(HaveOccurred())
 				})
 			})
 		})
@@ -368,7 +375,7 @@ var _ = Describe("update_pods", func() {
 			It("should return no errors and a map with one zone", func() {
 				// We only have one zone in this case, the simulation zone
 				Expect(updates).To(HaveLen(1))
-				Expect(err).NotTo(HaveOccurred())
+				Expect(updateErr).NotTo(HaveOccurred())
 			})

 			When("a Pod is missing", func() {
@@ -388,7 +395,7 @@ var _ = Describe("update_pods", func() {
 				When("the process group has no MissingPod condition", func() {
 					It("should return an error and an empty map", func() {
 						Expect(updates).To(HaveLen(0))
-						Expect(err).To(HaveOccurred())
+						Expect(updateErr).To(HaveOccurred())
 					})
 				})

@@ -400,7 +407,7 @@ var _ = Describe("update_pods", func() {

 					It("should return an error and an empty map", func() {
 						Expect(updates).To(HaveLen(0))
-						Expect(err).To(HaveOccurred())
+						Expect(updateErr).To(HaveOccurred())
 					})
 				})

@@ -412,7 +419,108 @@ var _ = Describe("update_pods", func() {

 					It("should return no error updates", func() {
 						Expect(updates).To(HaveLen(1))
+						Expect(updateErr).NotTo(HaveOccurred())
+					})
+				})
+			})
+
+			When("a Pod was recently created", func() {
+				var picked *fdbv1beta2.ProcessGroupStatus
+
+				BeforeEach(func() {
+					picked = internal.PickProcessGroups(cluster, fdbv1beta2.ProcessClassStorage, 1)[0]
+
+					podList := &corev1.PodList{}
+					Expect(k8sClient.List(context.Background(), podList, ctrlClient.InNamespace(cluster.Namespace), ctrlClient.MatchingLabels(cluster.GetMatchLabels()))).To(Succeed())
+
+					for _, pod := range podList.Items {
+						currentPod := pod.DeepCopy()
+						Expect(k8sClient.Delete(context.Background(), currentPod)).To(Succeed())
+
+						Expect(currentPod.Labels).NotTo(BeNil())
+						processGroupID, ok := currentPod.Labels[fdbv1beta2.FDBProcessGroupIDLabel]
+						Expect(ok).To(BeTrue())
+
+						var creationTimestamp metav1.Time
+						if processGroupID == string(picked.ProcessGroupID) {
+							creationTimestamp = metav1.NewTime(time.Now())
+						} else {
+							// Reset the metadata and ensure that all pods were created 24 hours ago
+							creationTimestamp = metav1.NewTime(time.Now().Add(-24 * time.Hour))
+						}

+						// Reset the metadata and ensure that all pods were created 24 hours ago
+						currentPod.ObjectMeta = metav1.ObjectMeta{
+							Name:              pod.Name,
+							Namespace:         pod.Namespace,
+							Annotations:       pod.Annotations,
+							Labels:            pod.Labels,
+							CreationTimestamp: creationTimestamp,
+						}
+
+						// Recreate Pod
+						Expect(k8sClient.Create(context.Background(), currentPod)).To(Succeed())
+
+					}
+
+					clusterReconciler.InSimulation = false
+				})
+
+				AfterEach(func() {
+					clusterReconciler.InSimulation = true
+				})
+
+				When("the process is not yet running", func() {
+					BeforeEach(func() {
+						adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient)
 						Expect(err).NotTo(HaveOccurred())
+						adminClient.MockMissingProcessGroup(picked.ProcessGroupID, true)
+					})
+
+					It("should return an error and an empty map", func() {
+						Expect(updates).To(HaveLen(0))
+						Expect(updateErr).To(MatchError(And(ContainSubstring("was recently created and the processes are not yet running"), ContainSubstring(string(picked.ProcessGroupID)))))
+					})
+				})
+
+				When("the process is running but the uptime seconds is greater than the pod uptime ", func() {
+					It("should return an error and an empty map", func() {
+						Expect(updates).To(HaveLen(0))
+						Expect(updateErr).To(MatchError(And(ContainSubstring("was recently created but the process uptime reports old uptime"), ContainSubstring(string(picked.ProcessGroupID)))))
+					})
+				})
+
+				When("the process is running and the uptime seconds is less than the pod uptime ", func() {
+					BeforeEach(func() {
+						pod := &corev1.Pod{}
+						Expect(k8sClient.Get(context.Background(), ctrlClient.ObjectKey{Name: picked.GetPodName(cluster), Namespace: cluster.Namespace}, pod)).To(Succeed())
+
+						pod.CreationTimestamp = metav1.NewTime(time.Now().Add(-6 * time.Hour))
+						Expect(k8sClient.Delete(context.Background(), pod)).To(Succeed())
+
+						creationTimestamp := time.Now().Add(-24 * time.Hour)
+						// We have to recreate the pod
+						pod.ObjectMeta = metav1.ObjectMeta{
+							Name:        pod.Name,
+							Namespace:   pod.Namespace,
+							Annotations: pod.Annotations,
+							Labels:      pod.Labels,
+							// Default uptime is 60000 seconds.
+							CreationTimestamp: metav1.NewTime(creationTimestamp),
+						}
+
+						// Recreate Pod
+						Expect(k8sClient.Create(context.Background(), pod)).To(Succeed())
+
+						newPod := &corev1.Pod{}
+						Expect(k8sClient.Get(context.Background(), ctrlClient.ObjectKey{Name: picked.GetPodName(cluster), Namespace: cluster.Namespace}, newPod)).To(Succeed())
+						Expect(newPod.CreationTimestamp.Time.Unix()).To(Equal(creationTimestamp.Unix()))
+
+					})
+
+					It("should return not error", func() {
+						Expect(updates).To(HaveLen(4))
+						Expect(updateErr).NotTo(HaveOccurred())
 					})
 				})
 			})
@@ -429,7 +537,7 @@ var _ = Describe("update_pods", func() {

 			It("should return no updates", func() {
 				Expect(updates).To(HaveLen(0))
-				Expect(err).NotTo(HaveOccurred())
+				Expect(updateErr).NotTo(HaveOccurred())
 			})
 		})

@@ -453,7 +561,7 @@ var _ = Describe("update_pods", func() {
 			It("should return no errors and a map with the zone and all pods to update", func() {
 				Expect(updates).To(HaveLen(1))
 				Expect(updates["simulation"]).To(HaveLen(4))
-				Expect(err).NotTo(HaveOccurred())
+				Expect(updateErr).NotTo(HaveOccurred())
 			})
 		})

@@ -470,7 +578,7 @@ var _ = Describe("update_pods", func() {
 			It("should return no errors and a map with the zone and two pods to update", func() {
 				Expect(updates).To(HaveLen(1))
 				Expect(updates["simulation"]).To(HaveLen(2))
-				Expect(err).NotTo(HaveOccurred())
+				Expect(updateErr).NotTo(HaveOccurred())
 			})
 		})

@@ -486,7 +594,7 @@ var _ = Describe("update_pods", func() {

 			It("should return no errors and a an empty update map", func() {
 				Expect(updates).To(HaveLen(0))
-				Expect(err).NotTo(HaveOccurred())
+				Expect(updateErr).NotTo(HaveOccurred())
 			})
 		})
 	})