-
Notifications
You must be signed in to change notification settings - Fork 13
Fix auth transition on edge-cases #321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 5 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,8 @@ | ||||||
--- | ||||||
title: Fixing auth transition edge-cases | ||||||
kind: fix | ||||||
date: 2025-08-08 | ||||||
--- | ||||||
|
||||||
* The agent returns ready if the cluster is ready to accept requests. The operator uses this information to continue operational actions like restarts. | ||||||
* This can be problematic during auth transitions: there can be a period where one authentication mechanism has already been invalidated while the other is not yet activated, and we try to use the unsupported one. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
By this, are we talking about the thing that is discussed in the first point? If yes, should we just have one point that explains it well, since both points seem closely related to each other? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
? |
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -85,12 +85,25 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin | |||||||||||||||||
|
||||||||||||||||||
goalsNotAchievedMap := map[string]int{} | ||||||||||||||||||
goalsAchievedMap := map[string]int{} | ||||||||||||||||||
authTransitionInProgress := map[string]string{} | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
? |
||||||||||||||||||
|
||||||||||||||||||
for _, p := range as.Processes { | ||||||||||||||||||
if !stringutil.Contains(relevantProcesses, p.Name) { | ||||||||||||||||||
continue | ||||||||||||||||||
} | ||||||||||||||||||
if p.LastGoalVersionAchieved == as.GoalVersion { | ||||||||||||||||||
goalsAchievedMap[p.Name] = p.LastGoalVersionAchieved | ||||||||||||||||||
|
||||||||||||||||||
// Check if authentication transitions are in the current plan | ||||||||||||||||||
// If a process has reached goal version but still has auth-related moves in plan, | ||||||||||||||||||
// it means authentication transition is likely in progress | ||||||||||||||||||
// The plan contains non-completed move names from the API | ||||||||||||||||||
Comment on lines
+97
to
+100
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
for _, move := range p.Plan { | ||||||||||||||||||
if isAuthenticationTransitionMove(move) { | ||||||||||||||||||
authTransitionInProgress[p.Name] = move | ||||||||||||||||||
break | ||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
} else { | ||||||||||||||||||
goalsNotAchievedMap[p.Name] = p.LastGoalVersionAchieved | ||||||||||||||||||
} | ||||||||||||||||||
|
@@ -103,6 +116,18 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin | |||||||||||||||||
goalsAchievedMsgList := slices.Collect(maps.Keys(goalsAchievedMap)) | ||||||||||||||||||
sort.Strings(goalsAchievedMsgList) | ||||||||||||||||||
|
||||||||||||||||||
// Check if any authentication transitions are in progress | ||||||||||||||||||
if len(authTransitionInProgress) > 0 { | ||||||||||||||||||
var authTransitionMsgList []string | ||||||||||||||||||
for processName, step := range authTransitionInProgress { | ||||||||||||||||||
authTransitionMsgList = append(authTransitionMsgList, fmt.Sprintf("%s:%s", processName, step)) | ||||||||||||||||||
} | ||||||||||||||||||
log.Infow("Authentication transitions still in progress, waiting for completion", | ||||||||||||||||||
"processes", authTransitionMsgList) | ||||||||||||||||||
return false, fmt.Sprintf("authentication transitions in progress for %d processes: %s", | ||||||||||||||||||
len(authTransitionInProgress), authTransitionMsgList) | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
if len(goalsNotAchievedMap) > 0 { | ||||||||||||||||||
return false, fmt.Sprintf("%d processes waiting to reach automation config goal state (version=%d): %s, %d processes reached goal state: %s", | ||||||||||||||||||
len(goalsNotAchievedMap), as.GoalVersion, goalsNotAchievedMsgList, len(goalsAchievedMsgList), goalsAchievedMsgList) | ||||||||||||||||||
|
@@ -113,17 +138,32 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin | |||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
// isAuthenticationTransitionMove reports whether move is one of the plan move names | ||||||||||||||||||
// that this package treats as belonging to an authentication transition. | ||||||||||||||||||
// | ||||||||||||||||||
// The comparison is an exact, case-sensitive match against a fixed set: | ||||||||||||||||||
// RestartMongod, UpdateAuth, UpdateConfig, WaitForHealthy and InitiateReplSet. | ||||||||||||||||||
// Any other value, including the empty string, yields false. | ||||||||||||||||||
// | ||||||||||||||||||
// NOTE(review): several of these names (e.g. RestartMongod, WaitForHealthy) do not | ||||||||||||||||||
// look specific to authentication; confirm they cannot appear in a plan for | ||||||||||||||||||
// unrelated reasons, otherwise callers may see false positives. | ||||||||||||||||||
func isAuthenticationTransitionMove(move string) bool { | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really like the approach of moving those phase specific checks outside of the readiness probe where we can only afford to treat them with a very wide brush. |
||||||||||||||||||
// Set of move names considered authentication-related. It is a local value, | ||||||||||||||||||
// so it is rebuilt on every call. | ||||||||||||||||||
authMoves := map[string]struct{}{ | ||||||||||||||||||
"RestartMongod": {}, | ||||||||||||||||||
"UpdateAuth": {}, | ||||||||||||||||||
"UpdateConfig": {}, | ||||||||||||||||||
"WaitForHealthy": {}, | ||||||||||||||||||
"InitiateReplSet": {}, | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
// Only key presence matters; the struct{} value carries no data. | ||||||||||||||||||
_, ok := authMoves[move] | ||||||||||||||||||
|
||||||||||||||||||
return ok | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
func areAnyAgentsInKubeUpgradeMode(as *AutomationStatus, relevantProcesses []string, log *zap.SugaredLogger) bool { | ||||||||||||||||||
for _, p := range as.Processes { | ||||||||||||||||||
if !stringutil.Contains(relevantProcesses, p.Name) { | ||||||||||||||||||
continue | ||||||||||||||||||
} | ||||||||||||||||||
for _, plan := range p.Plan { | ||||||||||||||||||
for _, planStep := range p.Plan { | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: In the code you introduced, you're ranging over |
||||||||||||||||||
// This means the following: | ||||||||||||||||||
// - the cluster is in static architecture | ||||||||||||||||||
// - the agents are in a dedicated upgrade process, waiting for their binaries to be replaced by kubernetes | ||||||||||||||||||
// - this can only happen if the statefulset is ready, therefore we are returning ready here | ||||||||||||||||||
if plan == automationAgentKubeUpgradePlan { | ||||||||||||||||||
if planStep == automationAgentKubeUpgradePlan { | ||||||||||||||||||
log.Debug("cluster is in changeVersionKube mode, returning the agent is ready.") | ||||||||||||||||||
return true | ||||||||||||||||||
} | ||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Is there a chance that users might be confused about whether we are talking about the K8s cluster or the MongoDB cluster? Should we be explicit and mention which cluster this is?