Add OTE helper functions in test/library/ote

gangwgr · gangwgr · commit 79403812482e · 2025-11-19T16:59:48.000+05:30
Add two new OTE helper functions for common test scenarios:

  1. WaitForAPIServerRollout: Waits for all API server pods to be recreated
     after a configuration change. Unlike WaitForAPIServerToStabilizeOnTheSameRevision,
     this specifically waits for NEW pods to replace old ones.

  2. WaitForFeatureGateEnabled: Waits for a specific feature gate to be enabled
     in the cluster by polling the FeatureGate resource.

  These functions are needed for testing configuration changes and feature gate
  enablement in operator e2e tests, particularly for EventTTL configuration tests
  in cluster-kube-apiserver-operator.
diff --git a/test/ote/util.go b/test/ote/util.go
@@ -0,0 +1,169 @@
+package ote
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
+	"github.com/openshift/library-go/test/library"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
+)
+
+// WaitForAPIServerRollout waits for all API server pods to be recreated and running
+// after a configuration change. Unlike WaitForAPIServerToStabilizeOnTheSameRevision which
+// waits for pods to converge on the same revision, this function specifically waits for
+// NEW pods (created after the function is called) to replace the old ones.
+//
+// This is useful when you make a configuration change and need to ensure all pods have
+// been recreated with the new configuration, not just that they're on the same revision.
+//
+// Parameters:
+//   - t: Logger interface for test output
+//   - podClient: Pod client interface for the target namespace
+//   - labelSelector: Label selector to identify API server pods (e.g., "apiserver=true")
+//   - timeout: Maximum time to wait for rollout to complete
+//
+// Returns:
+//   - error if timeout is reached or an error occurs during polling
+//
+// Note:
+//   - All existing pods must be replaced by new pods created after this function is called
+//   - Supports both single-node and multi-node deployments
+func WaitForAPIServerRollout(t library.LoggingT, podClient corev1client.PodInterface, labelSelector string, timeout time.Duration) error {
+	rolloutStartTime := time.Now()
+
+	// Get current pods before we start waiting
+	initialPods, err := podClient.List(context.Background(), metav1.ListOptions{
+		LabelSelector: labelSelector,
+	})
+	if err != nil {
+		t.Logf("Warning: Could not get initial pods: %v", err)
+	}
+
+	var oldestPodTime time.Time
+	initialRevision := ""
+	if initialPods != nil && len(initialPods.Items) > 0 {
+		oldestPodTime = initialPods.Items[0].CreationTimestamp.Time
+		for _, pod := range initialPods.Items {
+			if pod.CreationTimestamp.Time.Before(oldestPodTime) {
+				oldestPodTime = pod.CreationTimestamp.Time
+			}
+			if rev, ok := pod.Labels["revision"]; ok && initialRevision == "" {
+				initialRevision = rev
+			}
+		}
+		t.Logf("Initial state: %d pods, oldest created at %s, initial revision: %s",
+			len(initialPods.Items), oldestPodTime.Format(time.RFC3339), initialRevision)
+	}
+
+	attempt := 0
+	lastPodCount := 0
+	lastNotRunningCount := 0
+
+	return wait.PollUntilContextTimeout(context.Background(), 15*time.Second, timeout, false, func(ctx context.Context) (bool, error) {
+		attempt++
+		pods, err := podClient.List(ctx, metav1.ListOptions{
+			LabelSelector: labelSelector,
+		})
+		if err != nil {
+			t.Logf("[Attempt %d] Error listing pods: %v", attempt, err)
+			return false, nil
+		}
+
+		if len(pods.Items) == 0 {
+			t.Logf("[Attempt %d] No pods found yet", attempt)
+			return false, nil
+		}
+
+		// Count pods and check if we have new pods (created after rollout started)
+		notRunningCount := 0
+		newPodsCount := 0
+		runningNewPodsCount := 0
+		var notRunningPods []string
+		var currentRevision string
+
+		for _, pod := range pods.Items {
+			isNewPod := pod.CreationTimestamp.Time.After(rolloutStartTime)
+
+			if pod.Status.Phase != corev1.PodRunning {
+				notRunningCount++
+				notRunningPods = append(notRunningPods, fmt.Sprintf("%s (%s)", pod.Name, pod.Status.Phase))
+			}
+
+			if isNewPod {
+				newPodsCount++
+				if pod.Status.Phase == corev1.PodRunning {
+					runningNewPodsCount++
+				}
+			}
+
+			if rev, ok := pod.Labels["revision"]; ok && currentRevision == "" {
+				currentRevision = rev
+			}
+		}
+
+		// Success condition: ALL pods must be new (created after rolloutStartTime) and running
+		expectedPodCount := len(pods.Items)
+		allPodsNewAndRunning := newPodsCount == expectedPodCount && runningNewPodsCount == expectedPodCount
+
+		// Log only when state changes or every 4th attempt (1 minute)
+		if notRunningCount != lastNotRunningCount || len(pods.Items) != lastPodCount || attempt%4 == 0 {
+			if notRunningCount > 0 {
+				t.Logf("[Attempt %d] %d/%d pods running. Not running: %v. New pods: %d/%d running",
+					attempt, len(pods.Items)-notRunningCount, len(pods.Items), notRunningPods, runningNewPodsCount, newPodsCount)
+			} else {
+				t.Logf("[Attempt %d] All %d pods are running. New pods: %d/%d. Revision: %s",
+					attempt, len(pods.Items), runningNewPodsCount, newPodsCount, currentRevision)
+			}
+			lastPodCount = len(pods.Items)
+			lastNotRunningCount = notRunningCount
+		}
+
+		return allPodsNewAndRunning, nil
+	})
+}
+
+// WaitForFeatureGateEnabled waits for a specific feature gate to be enabled in the cluster.
+//
+// This function polls the FeatureGate resource until the specified feature is found in the
+// enabled list or the timeout is reached.
+//
+// Parameters:
+//   - t: Logger interface for test output
+//   - featureGateClient: FeatureGate client interface
+//   - featureName: Name of the feature gate to wait for (e.g., "EventTTL")
+//   - timeout: Maximum time to wait for the feature gate to be enabled
+//
+// Returns:
+//   - error if timeout is reached or an error occurs during polling
+func WaitForFeatureGateEnabled(t library.LoggingT, featureGateClient configv1client.FeatureGateInterface, featureName string, timeout time.Duration) error {
+	t.Logf("Waiting for feature gate %s to be enabled (timeout: %v)", featureName, timeout)
+	attempt := 0
+
+	return wait.PollUntilContextTimeout(context.Background(), 10*time.Second, timeout, false, func(ctx context.Context) (bool, error) {
+		attempt++
+		fg, err := featureGateClient.Get(ctx, "cluster", metav1.GetOptions{})
+		if err != nil {
+			t.Logf("[Attempt %d] Error getting feature gate: %v", attempt, err)
+			return false, nil
+		}
+
+		for _, fgDetails := range fg.Status.FeatureGates {
+			for _, enabled := range fgDetails.Enabled {
+				if string(enabled.Name) == featureName {
+					t.Logf("[Attempt %d] Feature gate %s is enabled", attempt, featureName)
+					return true, nil
+				}
+			}
+		}
+
+		if attempt%6 == 0 { // Log every minute
+			t.Logf("[Attempt %d] Feature gate %s not yet enabled, waiting...", attempt, featureName)
+		}
+		return false, nil
+	})
+}