diff --git a/changelog/20250808_fix_fixing_auth_transition_edge_cases.md b/changelog/20250808_fix_fixing_auth_transition_edge_cases.md new file mode 100644 index 000000000..2d0b273d7 --- /dev/null +++ b/changelog/20250808_fix_fixing_auth_transition_edge_cases.md @@ -0,0 +1,8 @@ +--- +title: Fixing auth transition edge-cases +kind: fix +date: 2025-08-08 +--- + +* The agent returns ready if the cluster is ready to accept requests. The operator uses this information to continue operational actions like restarts. +* This can be problematic during auth transitions. We can have a period where we invalidate one auth while the other is not activated yet and we try to use the not supported one diff --git a/controllers/om/automation_status.go b/controllers/om/automation_status.go index c41316687..575a90470 100644 --- a/controllers/om/automation_status.go +++ b/controllers/om/automation_status.go @@ -85,12 +85,25 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin goalsNotAchievedMap := map[string]int{} goalsAchievedMap := map[string]int{} + authTransitionInProgress := map[string]string{} + for _, p := range as.Processes { if !stringutil.Contains(relevantProcesses, p.Name) { continue } if p.LastGoalVersionAchieved == as.GoalVersion { goalsAchievedMap[p.Name] = p.LastGoalVersionAchieved + + // Check if authentication transitions are in the current plan + // If a process has reached goal version but still has auth-related moves in plan, + // it means authentication transition is likely in progress + // The plan contains non-completed move names from the API + for _, move := range p.Plan { + if isAuthenticationTransitionMove(move) { + authTransitionInProgress[p.Name] = move + break + } + } } else { goalsNotAchievedMap[p.Name] = p.LastGoalVersionAchieved } @@ -103,6 +116,18 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin goalsAchievedMsgList := slices.Collect(maps.Keys(goalsAchievedMap)) sort.Strings(goalsAchievedMsgList) + // Check if any authentication transitions are in progress + if len(authTransitionInProgress) > 0 { + var authTransitionMsgList []string + for processName, step := range authTransitionInProgress { + authTransitionMsgList = append(authTransitionMsgList, fmt.Sprintf("%s:%s", processName, step)) + } + log.Infow("Authentication transitions still in progress, waiting for completion", + "processes", authTransitionMsgList) + return false, fmt.Sprintf("authentication transitions in progress for %d processes: %s", + len(authTransitionInProgress), authTransitionMsgList) + } + if len(goalsNotAchievedMap) > 0 { return false, fmt.Sprintf("%d processes waiting to reach automation config goal state (version=%d): %s, %d processes reached goal state: %s", len(goalsNotAchievedMap), as.GoalVersion, goalsNotAchievedMsgList, len(goalsAchievedMsgList), goalsAchievedMsgList) @@ -113,17 +138,29 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin } } +// isAuthenticationTransitionMove returns true if the given move is related to authentication transitions +func isAuthenticationTransitionMove(move string) bool { + authMoves := map[string]struct{}{ + "UpdateAuth": {}, + "WaitAuthUpdate": {}, + } + + _, ok := authMoves[move] + + return ok +} + func areAnyAgentsInKubeUpgradeMode(as *AutomationStatus, relevantProcesses []string, log *zap.SugaredLogger) bool { for _, p := range as.Processes { if !stringutil.Contains(relevantProcesses, p.Name) { continue } - for _, plan := range p.Plan { + for _, planStep := range p.Plan { // This means the following: // - the cluster is in static architecture // - the agents are in a dedicated upgrade process, waiting for their binaries to be replaced by kubernetes // - this can only happen if the statefulset is ready, therefore we are returning ready here - if plan == automationAgentKubeUpgradePlan { + if planStep == automationAgentKubeUpgradePlan { log.Debug("cluster is in changeVersionKube mode, returning the agent is ready.") return true } diff --git a/controllers/om/automation_status_test.go b/controllers/om/automation_status_test.go index 7ac7f3dcf..6337eb035 100644 --- a/controllers/om/automation_status_test.go +++ b/controllers/om/automation_status_test.go @@ -119,3 +119,132 @@ func TestCheckAutomationStatusIsGoal(t *testing.T) { }) } } + +func TestCheckAutomationStatusIsGoal_AuthenticationTransitions(t *testing.T) { + logger := zap.NewNop().Sugar() + + tests := []struct { + name string + automationStatus *AutomationStatus + relevantProcesses []string + expectedReady bool + expectedMessage string + }{ + { + name: "should wait for UpdateAuth move to complete", + automationStatus: &AutomationStatus{ + GoalVersion: 5, + Processes: []ProcessStatus{ + { + Name: "rs0_0", + LastGoalVersionAchieved: 5, + Plan: []string{"UpdateAuth"}, + }, + }, + }, + relevantProcesses: []string{"rs0_0"}, + expectedReady: false, + expectedMessage: "authentication transitions in progress for 1 processes", + }, + { + name: "should be ready when authentication transitions are complete", + automationStatus: &AutomationStatus{ + GoalVersion: 5, + Processes: []ProcessStatus{ + { + Name: "rs0_0", + LastGoalVersionAchieved: 5, + Plan: []string{}, // Empty plan means all moves completed + }, + }, + }, + relevantProcesses: []string{"rs0_0"}, + expectedReady: true, + expectedMessage: "processes that reached goal state: [rs0_0]", + }, + { + name: "should wait for multiple processes with auth transitions", + automationStatus: &AutomationStatus{ + GoalVersion: 7, + Processes: []ProcessStatus{ + { + Name: "rs0_0", + LastGoalVersionAchieved: 7, + Plan: []string{}, // This process completed + }, + { + Name: "rs0_1", + LastGoalVersionAchieved: 7, + Plan: []string{"WaitAuthUpdate"}, // Auth-related move in progress + }, + }, + }, + relevantProcesses: []string{"rs0_0", "rs0_1"}, + expectedReady: false, + expectedMessage: "authentication transitions in progress for 1 processes", + }, + { + name: "should ignore non-authentication moves in progress", + automationStatus: &AutomationStatus{ + GoalVersion: 4, + Processes: []ProcessStatus{ + { + Name: "rs0_0", + LastGoalVersionAchieved: 4, + Plan: []string{"SomeOtherMove"}, // Non-auth move + }, + }, + }, + relevantProcesses: []string{"rs0_0"}, + expectedReady: true, + expectedMessage: "processes that reached goal state: [rs0_0]", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ready, message := checkAutomationStatusIsGoal( + tt.automationStatus, + tt.relevantProcesses, + logger, + ) + + assert.Equal(t, tt.expectedReady, ready, "Ready state should match expected") + assert.Contains(t, message, tt.expectedMessage, "Message should contain expected text") + + if tt.expectedReady { + t.Logf("✅ Process correctly marked as ready: %s", message) + } else { + t.Logf("⏳ Process correctly waiting for auth transition: %s", message) + } + }) + } +} + +func TestIsAuthenticationTransitionMove(t *testing.T) { + authMoves := []string{ + "UpdateAuth", + "WaitAuthUpdate", + } + + nonAuthMoves := []string{ + "SomeOtherMove", + "CreateIndex", + "DropCollection", + "BackupDatabase", + } + + for _, move := range authMoves { + t.Run("auth_move_"+move, func(t *testing.T) { + assert.True(t, isAuthenticationTransitionMove(move), + "Move %s should be recognized as authentication transition", move) + }) + } + + for _, move := range nonAuthMoves { + t.Run("non_auth_move_"+move, func(t *testing.T) { + assert.False(t, isAuthenticationTransitionMove(move), + "Move %s should not be recognized as authentication transition", move) + }) + } +}