Fix auth transition on edge-cases (#321)

nammn · web-flow · commit 607507236564 · 2025-08-13T14:23:05.000+02:00
# Summary **Add Not-Ready Handling for Ongoing Auth Transitions**: This patch refines our readiness logic to correctly reflect the state of authentication transitions. Previously, we treated LastGoalVersionAchieved == GoalVersion as a signal that the cluster was "Running", but this assumption breaks down when auth transitions are still in progress. This happened because we returned "ready" during a wait step (WaitAuthCanUpdate) — and [we generally return ready for all wait steps](https://github.com/mongodb/mongodb-kubernetes/blob/f0050b8942545701e8cb9e42d54d14f0cb58ee6a/mongodb-community-operator/cmd/readiness/main.go#L139), regardless of whether auth is fully transitioned. Example status: ``` { "step": "WaitAuthUpdate", "stepDoc": "Wait to update Auth", "isWaitStep": true, "started": "2025-08-07T14:59:40.213178437Z", "attempts": 512, "latestAttempt": "2025-08-07T15:09:20.966699961Z", "completed": null, "result": "wait" } ``` **Why implemented in the operator and not readinessProbe**: I fixed this in the operator rather than in the readinessProbe: * if the readinessProbe blocks, new nodes do not come up * we want new nodes to come up * but we also want to block new configurations from being applied, which the automation_status check in the operator does **The core idea:** * Configuration applied ≠ transition fully complete. 
**What happened in our tests**: * we update auth via CR x509 -> scram * `node-0` completed its auth transition (now uses scram, instead of x509) * `Config server` hasn't finished its auth transition yet * We hit a race condition where clusters were marked as "Running" too early and thus continued the rolling restart of `node-0` * `node-0` restarted with the old X509 config (see the comment from the agent code below) * The X509 process couldn’t access the SCRAM automation user * This leads to the error: "process...doesn't have the automation user" - in mms-automation there is also a comment indicating that they handle the edge case where an auth transition was not successful: they start the process with the old auth to "finish" it. But this is exactly what causes our race condition ``` // If a process went down unexpectedly in the middle of an auth transition, // we want to restart it with the old auth args. // Otherwise, it could be upgraded to the new auth state too soon, // and not be able to communicate with other shard members. ``` tl;dr: first `node-0` moved to new auth, `config` not yet, `node-0` restarted and during the restart `config` transitioned to the new auth while `node-0` is again running old auth ## Proof of Work - auth change tests are passing multiple times in a row: [Link](http://spruce.mongodb.com/version/6894b98218a2e90007437e99/tasks?sorts=STATUS%3AASC%3BBASE_STATUS%3ADESC) - the most flaky auth tests + [Link2](https://spruce.mongodb.com/task/mongodb_kubernetes_e2e_static_mdb_kind_ubi_cloudqa_e2e_sharded_cluster_x509_to_scram_transition_patch_b29fb4ace63eec7102f8f034fd6c553b5d75c1a1_6894c0785c119f0007a58f3c_25_08_07_15_04_26/logs?execution=0) - from the patch ## Checklist - [ ] Have you linked a jira ticket and/or is the ticket in the title? - [x] Have you checked whether your jira ticket required DOCSP changes? - [x] Have you added changelog file? 
- use `skip-changelog` label if not needed - refer to [Changelog files and Release Notes](https://github.com/mongodb/mongodb-kubernetes/blob/master/CONTRIBUTING.md#changelog-files-and-release-notes) section in CONTRIBUTING.md for more details
diff --git a/changelog/20250808_fix_fixing_auth_transition_edge_cases.md b/changelog/20250808_fix_fixing_auth_transition_edge_cases.md
@@ -0,0 +1,7 @@
+---
+title: Fixing auth transition edge-cases
+kind: fix
+date: 2025-08-08
+---
+
+* Fixed an issue where the readiness probe reported the node as ready even when its authentication mechanism was not in sync with the other nodes, potentially causing premature restarts.
diff --git a/controllers/om/automation_status.go b/controllers/om/automation_status.go
@@ -15,7 +15,7 @@ import (
 	"github.com/mongodb/mongodb-kubernetes/pkg/util/stringutil"
 )
 
-const automationAgentKubeUpgradePlan = "ChangeVersionKube"
+const automationAgentKubeUpgradeMove = "ChangeVersionKube"
 
 // AutomationStatus represents the status of automation agents registered with Ops Manager
 type AutomationStatus struct {
@@ -85,12 +85,25 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
 
 	goalsNotAchievedMap := map[string]int{}
 	goalsAchievedMap := map[string]int{}
+	authTransitionsInProgress := map[string]string{}
+
 	for _, p := range as.Processes {
 		if !stringutil.Contains(relevantProcesses, p.Name) {
 			continue
 		}
 		if p.LastGoalVersionAchieved == as.GoalVersion {
 			goalsAchievedMap[p.Name] = p.LastGoalVersionAchieved
+
+			// Check if authentication transitions are in the current plan.
+			// If a process has reached goal version but still has auth-related moves in plan,
+			// it means authentication transition is likely in progress.
+			// The plan contains non-completed move names from the API.
+			for _, move := range p.Plan {
+				if isAuthenticationTransitionMove(move) {
+					authTransitionsInProgress[p.Name] = move
+					break
+				}
+			}
 		} else {
 			goalsNotAchievedMap[p.Name] = p.LastGoalVersionAchieved
 		}
@@ -103,6 +116,18 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
 	goalsAchievedMsgList := slices.Collect(maps.Keys(goalsAchievedMap))
 	sort.Strings(goalsAchievedMsgList)
 
+	// Check if any authentication transitions are in progress
+	if len(authTransitionsInProgress) > 0 {
+		var authTransitionMsgList []string
+		for processName, step := range authTransitionsInProgress {
+			authTransitionMsgList = append(authTransitionMsgList, fmt.Sprintf("%s:%s", processName, step))
+		}
+		log.Infow("Authentication transitions still in progress, waiting for completion",
+			"processes", authTransitionMsgList)
+		return false, fmt.Sprintf("authentication transitions in progress for %d processes: %s",
+			len(authTransitionsInProgress), authTransitionMsgList)
+	}
+
 	if len(goalsNotAchievedMap) > 0 {
 		return false, fmt.Sprintf("%d processes waiting to reach automation config goal state (version=%d): %s, %d processes reached goal state: %s",
 			len(goalsNotAchievedMap), as.GoalVersion, goalsNotAchievedMsgList, len(goalsAchievedMsgList), goalsAchievedMsgList)
@@ -113,17 +138,29 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
 	}
 }
 
+// isAuthenticationTransitionMove reports whether move is one of the agent plan
+// moves that belong to an authentication transition ("UpdateAuth" or
+// "WaitAuthUpdate"). The match is exact and case-sensitive.
+func isAuthenticationTransitionMove(move string) bool {
+	switch move {
+	case "UpdateAuth", "WaitAuthUpdate":
+		return true
+	default:
+		return false
+	}
+}
+
 func areAnyAgentsInKubeUpgradeMode(as *AutomationStatus, relevantProcesses []string, log *zap.SugaredLogger) bool {
 	for _, p := range as.Processes {
 		if !stringutil.Contains(relevantProcesses, p.Name) {
 			continue
 		}
-		for _, plan := range p.Plan {
+		for _, move := range p.Plan {
 			// This means the following:
 			// - the cluster is in static architecture
 			// - the agents are in a dedicated upgrade process, waiting for their binaries to be replaced by kubernetes
 			// - this can only happen if the statefulset is ready, therefore we are returning ready here
-			if plan == automationAgentKubeUpgradePlan {
+			if move == automationAgentKubeUpgradeMove {
 				log.Debug("cluster is in changeVersionKube mode, returning the agent is ready.")
 				return true
 			}
diff --git a/controllers/om/automation_status_test.go b/controllers/om/automation_status_test.go
@@ -75,7 +75,7 @@ func TestCheckAutomationStatusIsGoal(t *testing.T) {
 						},
 						{
 							Name:                    "b",
-							Plan:                    []string{"FCV", automationAgentKubeUpgradePlan},
+							Plan:                    []string{"FCV", automationAgentKubeUpgradeMove},
 							LastGoalVersionAchieved: 1,
 						},
 					},
@@ -119,3 +119,132 @@ func TestCheckAutomationStatusIsGoal(t *testing.T) {
 		})
 	}
 }
+
+// TestCheckAutomationStatusIsGoal_AuthenticationTransitions is a table-driven
+// test for checkAutomationStatusIsGoal. Every process in the table has already
+// reached the goal version; the cases differ only in what is left in Plan.
+// The function is expected to report not-ready while a relevant process still
+// lists "UpdateAuth" or "WaitAuthUpdate" in its Plan, and ready otherwise.
+func TestCheckAutomationStatusIsGoal_AuthenticationTransitions(t *testing.T) {
+	// No-op logger: log output is not part of what this test asserts.
+	logger := zap.NewNop().Sugar()
+
+	tests := []struct {
+		name              string
+		automationStatus  *AutomationStatus
+		relevantProcesses []string
+		expectedReady     bool
+		expectedMessage   string
+	}{
+		{
+			name: "should wait for UpdateAuth move to complete",
+			automationStatus: &AutomationStatus{
+				GoalVersion: 5,
+				Processes: []ProcessStatus{
+					{
+						Name:                    "rs0_0",
+						LastGoalVersionAchieved: 5,
+						Plan:                    []string{"UpdateAuth"},
+					},
+				},
+			},
+			relevantProcesses: []string{"rs0_0"},
+			expectedReady:     false,
+			expectedMessage:   "authentication transitions in progress for 1 processes",
+		},
+		{
+			name: "should be ready when authentication transitions are complete",
+			automationStatus: &AutomationStatus{
+				GoalVersion: 5,
+				Processes: []ProcessStatus{
+					{
+						Name:                    "rs0_0",
+						LastGoalVersionAchieved: 5,
+						Plan:                    []string{}, // Empty plan means all moves completed
+					},
+				},
+			},
+			relevantProcesses: []string{"rs0_0"},
+			expectedReady:     true,
+			expectedMessage:   "processes that reached goal state: [rs0_0]",
+		},
+		{
+			// Two relevant processes, but only rs0_1 still has an auth move
+			// in its plan, so the expected message counts a single process.
+			name: "should wait for multiple processes with auth transitions",
+			automationStatus: &AutomationStatus{
+				GoalVersion: 7,
+				Processes: []ProcessStatus{
+					{
+						Name:                    "rs0_0",
+						LastGoalVersionAchieved: 7,
+						Plan:                    []string{}, // This process completed
+					},
+					{
+						Name:                    "rs0_1",
+						LastGoalVersionAchieved: 7,
+						Plan:                    []string{"WaitAuthUpdate"}, // Auth-related move in progress
+					},
+				},
+			},
+			relevantProcesses: []string{"rs0_0", "rs0_1"},
+			expectedReady:     false,
+			expectedMessage:   "authentication transitions in progress for 1 processes",
+		},
+		{
+			// A remaining move that is not auth-related must not block readiness.
+			name: "should ignore non-authentication moves in progress",
+			automationStatus: &AutomationStatus{
+				GoalVersion: 4,
+				Processes: []ProcessStatus{
+					{
+						Name:                    "rs0_0",
+						LastGoalVersionAchieved: 4,
+						Plan:                    []string{"SomeOtherMove"}, // Non-auth move
+					},
+				},
+			},
+			relevantProcesses: []string{"rs0_0"},
+			expectedReady:     true,
+			expectedMessage:   "processes that reached goal state: [rs0_0]",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ready, message := checkAutomationStatusIsGoal(
+				tt.automationStatus,
+				tt.relevantProcesses,
+				logger,
+			)
+
+			assert.Equal(t, tt.expectedReady, ready, "Ready state should match expected")
+			// Substring match only: expectedMessage is a fragment of the full message.
+			assert.Contains(t, message, tt.expectedMessage, "Message should contain expected text")
+
+			// Informational output, shown with `go test -v`; not an assertion.
+			if tt.expectedReady {
+				t.Logf("✅ Process correctly marked as ready: %s", message)
+			} else {
+				t.Logf("⏳ Process correctly waiting for auth transition: %s", message)
+			}
+		})
+	}
+}
+
+// TestIsAuthenticationTransitionMove checks isAuthenticationTransitionMove
+// against a fixed list of move names: the two auth-transition moves must be
+// accepted and the remaining, unrelated names must be rejected.
+func TestIsAuthenticationTransitionMove(t *testing.T) {
+	// Auth moves come first so subtests run in the same order as before.
+	cases := []struct {
+		move     string
+		wantAuth bool
+	}{
+		{move: "UpdateAuth", wantAuth: true},
+		{move: "WaitAuthUpdate", wantAuth: true},
+		{move: "SomeOtherMove", wantAuth: false},
+		{move: "CreateIndex", wantAuth: false},
+		{move: "DropCollection", wantAuth: false},
+		{move: "BackupDatabase", wantAuth: false},
+	}
+
+	for _, tc := range cases {
+		subtestPrefix := "non_auth_move_"
+		if tc.wantAuth {
+			subtestPrefix = "auth_move_"
+		}
+
+		t.Run(subtestPrefix+tc.move, func(t *testing.T) {
+			got := isAuthenticationTransitionMove(tc.move)
+			if tc.wantAuth {
+				assert.True(t, got,
+					"Move %s should be recognized as authentication transition", tc.move)
+				return
+			}
+			assert.False(t, got,
+				"Move %s should not be recognized as authentication transition", tc.move)
+		})
+	}
+}