Skip to content

Fix auth transition on edge-cases #321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog/20250808_fix_fixing_auth_transition_edge_cases.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
title: Fixing auth transition edge-cases
kind: fix
date: 2025-08-08
---

* The agent returns ready if the cluster is ready to accept requests. The operator uses this information to continue operational actions like restarts.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The agent returns ready if the cluster is ready to accept requests.

nit: Are there are chances that user might get confused if it's K8s cluster or MongoDB cluster that we are talking about? Should we explicit and mention which cluster this is?

* This can be problematic during auth transitions. We can have a period where we invalidate one auth while the other is not activated yet and we try to use the not supported one
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be problematic during auth transitions.

By this, are we talking about the thing that is discussed in the first point? If yes, should be just have one point to explain it well, seems both the points are related closely to each other.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* This can be problematic during auth transitions. We can have a period where we invalidate one auth while the other is not activated yet and we try to use the not supported one
* This can be problematic during auth transitions. We can have a period where we invalidate one auth while the other is not activated yet and we try to use the not supported one.

?

44 changes: 42 additions & 2 deletions controllers/om/automation_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,25 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin

goalsNotAchievedMap := map[string]int{}
goalsAchievedMap := map[string]int{}
authTransitionInProgress := map[string]string{}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
authTransitionInProgress := map[string]string{}
authTransitionsInProgress := map[string]string{}

?


for _, p := range as.Processes {
if !stringutil.Contains(relevantProcesses, p.Name) {
continue
}
if p.LastGoalVersionAchieved == as.GoalVersion {
goalsAchievedMap[p.Name] = p.LastGoalVersionAchieved

// Check if authentication transitions are in the current plan
// If a process has reached goal version but still has auth-related moves in plan,
// it means authentication transition is likely in progress
// The plan contains non-completed move names from the API
Comment on lines +97 to +100
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Check if authentication transitions are in the current plan
// If a process has reached goal version but still has auth-related moves in plan,
// it means authentication transition is likely in progress
// The plan contains non-completed move names from the API
// Check if authentication transitions are in the current plan.
// If a process has reached goal version but still has auth-related moves in plan,
// it means authentication transition is likely in progress.
// The plan contains non-completed move names from the API.

for _, move := range p.Plan {
if isAuthenticationTransitionMove(move) {
authTransitionInProgress[p.Name] = move
break
}
}
} else {
goalsNotAchievedMap[p.Name] = p.LastGoalVersionAchieved
}
Expand All @@ -103,6 +116,18 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
goalsAchievedMsgList := slices.Collect(maps.Keys(goalsAchievedMap))
sort.Strings(goalsAchievedMsgList)

// Check if any authentication transitions are in progress
if len(authTransitionInProgress) > 0 {
var authTransitionMsgList []string
for processName, step := range authTransitionInProgress {
authTransitionMsgList = append(authTransitionMsgList, fmt.Sprintf("%s:%s", processName, step))
}
log.Infow("Authentication transitions still in progress, waiting for completion",
"processes", authTransitionMsgList)
return false, fmt.Sprintf("authentication transitions in progress for %d processes: %s",
len(authTransitionInProgress), authTransitionMsgList)
}

if len(goalsNotAchievedMap) > 0 {
return false, fmt.Sprintf("%d processes waiting to reach automation config goal state (version=%d): %s, %d processes reached goal state: %s",
len(goalsNotAchievedMap), as.GoalVersion, goalsNotAchievedMsgList, len(goalsAchievedMsgList), goalsAchievedMsgList)
Expand All @@ -113,17 +138,32 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
}
}

// isAuthenticationTransitionMove returns true if the given move is related to authentication transitions
func isAuthenticationTransitionMove(move string) bool {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really like the approach of moving those phase specific checks outside of the readiness probe where we can only afford to treat them with a very wide brush.

authMoves := map[string]struct{}{
"RestartMongod": {},
"UpdateAuth": {},
"UpdateConfig": {},
"WaitForHealthy": {},
"InitiateReplSet": {},
}

_, ok := authMoves[move]

return ok
}

func areAnyAgentsInKubeUpgradeMode(as *AutomationStatus, relevantProcesses []string, log *zap.SugaredLogger) bool {
for _, p := range as.Processes {
if !stringutil.Contains(relevantProcesses, p.Name) {
continue
}
for _, plan := range p.Plan {
for _, planStep := range p.Plan {
Copy link
Member

@mircea-cosbuc mircea-cosbuc Aug 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: In the code you introduced, you're ranging over p.Plan with move and this is refactored to use planStep instead. Let's stay consistent (I think we can use move since automation uses the same terminology).

// This means the following:
// - the cluster is in static architecture
// - the agents are in a dedicated upgrade process, waiting for their binaries to be replaced by kubernetes
// - this can only happen if the statefulset is ready, therefore we are returning ready here
if plan == automationAgentKubeUpgradePlan {
if planStep == automationAgentKubeUpgradePlan {
log.Debug("cluster is in changeVersionKube mode, returning the agent is ready.")
return true
}
Expand Down
148 changes: 148 additions & 0 deletions controllers/om/automation_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,151 @@ func TestCheckAutomationStatusIsGoal(t *testing.T) {
})
}
}

func TestCheckAutomationStatusIsGoal_AuthenticationTransitions(t *testing.T) {
logger := zap.NewNop().Sugar()

tests := []struct {
name string
automationStatus *AutomationStatus
relevantProcesses []string
expectedReady bool
expectedMessage string
}{
{
name: "should wait for UpdateAuth move to complete",
automationStatus: &AutomationStatus{
GoalVersion: 5,
Processes: []ProcessStatus{
{
Name: "rs0_0",
LastGoalVersionAchieved: 5,
Plan: []string{"UpdateAuth"},
},
},
},
relevantProcesses: []string{"rs0_0"},
expectedReady: false,
expectedMessage: "authentication transitions in progress for 1 processes",
},
{
name: "should wait for InitiateReplSet move to complete",
automationStatus: &AutomationStatus{
GoalVersion: 3,
Processes: []ProcessStatus{
{
Name: "rs0_0",
LastGoalVersionAchieved: 3,
Plan: []string{"InitiateReplSet"},
},
},
},
relevantProcesses: []string{"rs0_0"},
expectedReady: false,
expectedMessage: "authentication transitions in progress for 1 processes",
},
{
name: "should be ready when authentication transitions are complete",
automationStatus: &AutomationStatus{
GoalVersion: 5,
Processes: []ProcessStatus{
{
Name: "rs0_0",
LastGoalVersionAchieved: 5,
Plan: []string{}, // Empty plan means all moves completed
},
},
},
relevantProcesses: []string{"rs0_0"},
expectedReady: true,
expectedMessage: "processes that reached goal state: [rs0_0]",
},
{
name: "should wait for multiple processes with auth transitions",
automationStatus: &AutomationStatus{
GoalVersion: 7,
Processes: []ProcessStatus{
{
Name: "rs0_0",
LastGoalVersionAchieved: 7,
Plan: []string{}, // This process completed
},
{
Name: "rs0_1",
LastGoalVersionAchieved: 7,
Plan: []string{"RestartMongod"}, // Auth-related move in progress
},
},
},
relevantProcesses: []string{"rs0_0", "rs0_1"},
expectedReady: false,
expectedMessage: "authentication transitions in progress for 1 processes",
},
{
name: "should ignore non-authentication moves in progress",
automationStatus: &AutomationStatus{
GoalVersion: 4,
Processes: []ProcessStatus{
{
Name: "rs0_0",
LastGoalVersionAchieved: 4,
Plan: []string{"SomeOtherMove"}, // Non-auth move
},
},
},
relevantProcesses: []string{"rs0_0"},
expectedReady: true,
expectedMessage: "processes that reached goal state: [rs0_0]",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ready, message := checkAutomationStatusIsGoal(
tt.automationStatus,
tt.relevantProcesses,
logger,
)

assert.Equal(t, tt.expectedReady, ready, "Ready state should match expected")
assert.Contains(t, message, tt.expectedMessage, "Message should contain expected text")

if tt.expectedReady {
t.Logf("✅ Process correctly marked as ready: %s", message)
} else {
t.Logf("⏳ Process correctly waiting for auth transition: %s", message)
}
})
}
}

func TestIsAuthenticationTransitionMove(t *testing.T) {
authMoves := []string{
"RestartMongod",
"UpdateAuth",
"UpdateConfig",
"WaitForHealthy",
"InitiateReplSet",
}

nonAuthMoves := []string{
"SomeOtherMove",
"CreateIndex",
"DropCollection",
"BackupDatabase",
}

for _, move := range authMoves {
t.Run("auth_move_"+move, func(t *testing.T) {
assert.True(t, isAuthenticationTransitionMove(move),
"Move %s should be recognized as authentication transition", move)
})
}

for _, move := range nonAuthMoves {
t.Run("non_auth_move_"+move, func(t *testing.T) {
assert.False(t, isAuthenticationTransitionMove(move),
"Move %s should not be recognized as authentication transition", move)
})
}
}