diff --git a/test/ote/util.go b/test/ote/util.go new file mode 100644 index 0000000000..7789f2a574 --- /dev/null +++ b/test/ote/util.go @@ -0,0 +1,169 @@ +package ote + +import ( + "context" + "fmt" + "time" + + configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" + "github.com/openshift/library-go/test/library" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// WaitForAPIServerRollout waits for all API server pods to be recreated and running +// after a configuration change. Unlike WaitForAPIServerToStabilizeOnTheSameRevision which +// waits for pods to converge on the same revision, this function specifically waits for +// NEW pods (created after the function is called) to replace the old ones. +// +// This is useful when you make a configuration change and need to ensure all pods have +// been recreated with the new configuration, not just that they're on the same revision. +// +// Parameters: +// - t: Logger interface for test output +// - podClient: Pod client interface for the target namespace +// - labelSelector: Label selector to identify API server pods (e.g., "apiserver=true") +// - timeout: Maximum time to wait for rollout to complete +// +// Returns: +// - error if timeout is reached or an error occurs during polling +// +// Note: +// - All existing pods must be replaced by new pods created after this function is called +// - Supports both single-node and multi-node deployments +func WaitForAPIServerRollout(t library.LoggingT, podClient corev1client.PodInterface, labelSelector string, timeout time.Duration) error { + rolloutStartTime := time.Now() + + // Get current pods before we start waiting + initialPods, err := podClient.List(context.Background(), metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + t.Logf("Warning: Could not get initial pods: %v", err) + } + + var oldestPodTime time.Time + initialRevision := "" + if initialPods != nil && len(initialPods.Items) > 0 { + oldestPodTime = initialPods.Items[0].CreationTimestamp.Time + for _, pod := range initialPods.Items { + if pod.CreationTimestamp.Time.Before(oldestPodTime) { + oldestPodTime = pod.CreationTimestamp.Time + } + if rev, ok := pod.Labels["revision"]; ok && initialRevision == "" { + initialRevision = rev + } + } + t.Logf("Initial state: %d pods, oldest created at %s, initial revision: %s", + len(initialPods.Items), oldestPodTime.Format(time.RFC3339), initialRevision) + } + + attempt := 0 + lastPodCount := 0 + lastNotRunningCount := 0 + + return wait.PollUntilContextTimeout(context.Background(), 15*time.Second, timeout, false, func(ctx context.Context) (bool, error) { + attempt++ + pods, err := podClient.List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + t.Logf("[Attempt %d] Error listing pods: %v", attempt, err) + return false, nil + } + + if len(pods.Items) == 0 { + t.Logf("[Attempt %d] No pods found yet", attempt) + return false, nil + } + + // Count pods and check if we have new pods (created after rollout started) + notRunningCount := 0 + newPodsCount := 0 + runningNewPodsCount := 0 + var notRunningPods []string + var currentRevision string + + for _, pod := range pods.Items { + isNewPod := pod.CreationTimestamp.Time.After(rolloutStartTime) + + if pod.Status.Phase != corev1.PodRunning { + notRunningCount++ + notRunningPods = append(notRunningPods, fmt.Sprintf("%s (%s)", pod.Name, pod.Status.Phase)) + } + + if isNewPod { + newPodsCount++ + if pod.Status.Phase == corev1.PodRunning { + runningNewPodsCount++ + } + } + + if rev, ok := pod.Labels["revision"]; ok && currentRevision == "" { + currentRevision = rev + } + } + + // Success condition: ALL pods must be new (created after rolloutStartTime) and running + expectedPodCount := len(pods.Items) + allPodsNewAndRunning := newPodsCount == expectedPodCount && runningNewPodsCount == expectedPodCount + + // Log only when state changes or every 4th attempt (1 minute) + if notRunningCount != lastNotRunningCount || len(pods.Items) != lastPodCount || attempt%4 == 0 { + if notRunningCount > 0 { + t.Logf("[Attempt %d] %d/%d pods running. Not running: %v. New pods: %d/%d running", + attempt, len(pods.Items)-notRunningCount, len(pods.Items), notRunningPods, runningNewPodsCount, newPodsCount) + } else { + t.Logf("[Attempt %d] All %d pods are running. New pods: %d/%d. Revision: %s", + attempt, len(pods.Items), runningNewPodsCount, newPodsCount, currentRevision) + } + lastPodCount = len(pods.Items) + lastNotRunningCount = notRunningCount + } + + return allPodsNewAndRunning, nil + }) +} + +// WaitForFeatureGateEnabled waits for a specific feature gate to be enabled in the cluster. +// +// This function polls the FeatureGate resource until the specified feature is found in the +// enabled list or the timeout is reached. +// +// Parameters: +// - t: Logger interface for test output +// - featureGateClient: FeatureGate client interface +// - featureName: Name of the feature gate to wait for (e.g., "EventTTL") +// - timeout: Maximum time to wait for the feature gate to be enabled +// +// Returns: +// - error if timeout is reached or an error occurs during polling +func WaitForFeatureGateEnabled(t library.LoggingT, featureGateClient configv1client.FeatureGateInterface, featureName string, timeout time.Duration) error { + t.Logf("Waiting for feature gate %s to be enabled (timeout: %v)", featureName, timeout) + attempt := 0 + + return wait.PollUntilContextTimeout(context.Background(), 10*time.Second, timeout, false, func(ctx context.Context) (bool, error) { + attempt++ + fg, err := featureGateClient.Get(ctx, "cluster", metav1.GetOptions{}) + if err != nil { + t.Logf("[Attempt %d] Error getting feature gate: %v", attempt, err) + return false, nil + } + + for _, fgDetails := range fg.Status.FeatureGates { + for _, enabled := range fgDetails.Enabled { + if string(enabled.Name) == featureName { + t.Logf("[Attempt %d] Feature gate %s is enabled", attempt, featureName) + return true, nil + } + } + } + + if attempt%6 == 0 { // Log every minute + t.Logf("[Attempt %d] Feature gate %s not yet enabled, waiting...", attempt, featureName) + } + return false, nil + }) +}