Skip to content

Commit 0ee61e2

Browse files
fix(e2e): wait for leader election & measure timing for better monitoring
TestClusterExtensionAfterOLMUpgrade was failing due to increased leader election timeouts, causing reconciliation checks to run before leadership was acquired. This fix ensures the test explicitly waits for leader election logs (`"successfully acquired lease"`) before verifying reconciliation. Additionally, the test now measures and logs the leader election duration to help monitor election timing.
1 parent c3a4406 commit 0ee61e2

File tree

3 files changed

+21
-6
lines changed

3 files changed

+21
-6
lines changed

catalogd/cmd/catalogd/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,9 @@ func main() {
235235
LeaderElectionID: "catalogd-operator-lock",
236236
// Recommended Leader Election values
237237
// https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption
238-
LeaseDuration: ptr.To(137 * time.Second),
239-
RenewDeadline: ptr.To(107 * time.Second),
240-
RetryPeriod: ptr.To(26 * time.Second),
238+
LeaseDuration: ptr.To(137 * time.Second), // Default: 15s
239+
RenewDeadline: ptr.To(107 * time.Second), // Default: 10s
240+
RetryPeriod: ptr.To(26 * time.Second), // Default: 2s
241241

242242
WebhookServer: webhookServer,
243243
Cache: cacheOptions,

cmd/operator-controller/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,9 @@ func main() {
235235
LeaderElectionID: "9c4404e7.operatorframework.io",
236236
// Recommended Leader Election values
237237
// https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption
238-
LeaseDuration: ptr.To(137 * time.Second),
239-
RenewDeadline: ptr.To(107 * time.Second),
240-
RetryPeriod: ptr.To(26 * time.Second),
238+
LeaseDuration: ptr.To(137 * time.Second), // Default: 15s
239+
RenewDeadline: ptr.To(107 * time.Second), // Default: 10s
240+
RetryPeriod: ptr.To(26 * time.Second), // Default: 2s
241241

242242
Cache: cacheOptions,
243243
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily

test/upgrade-e2e/post_upgrade_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,21 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
4040
t.Log("Wait for operator-controller deployment to be ready")
4141
managerPod := waitForDeployment(t, ctx, "operator-controller-controller-manager")
4242

43+
t.Log("Start measuring leader election time")
44+
// - Best case (new pod starts): ~1–5 seconds
45+
// - Average case (leader exists, but lease not expired): roughly 26–52 seconds
46+
// - Worst case (leader was there but crashed): LeaseDuration (137s) + RetryPeriod (26s) = ~163 seconds
47+
leaderStartTime := time.Now()
48+
leaderElectionCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
49+
defer leaderCancel()
50+
51+
leaderSubstrings := []string{"successfully acquired lease"}
52+
leaderElected, err := watchPodLogsForSubstring(leaderElectionCtx, managerPod, "manager", leaderSubstrings...)
53+
require.NoError(t, err)
54+
require.True(t, leaderElected)
55+
leaderElectionDuration := time.Since(leaderStartTime)
56+
t.Logf("Leader election took %v seconds", leaderElectionDuration.Seconds())
57+
4358
t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it")
4459
// Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes
4560
logCtx, cancel := context.WithTimeout(ctx, time.Minute)

0 commit comments

Comments
 (0)