Skip to content

Commit f20d007

Browse files
committed
Merge branch 'maciejk/ar-image-staging' into maciejk/ar-image-release
# Conflicts: # scripts/release/build/build_info.py
2 parents 0219543 + 18582c7 commit f20d007

File tree

58 files changed

+348
-84
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+348
-84
lines changed

.evergreen-functions.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ functions:
336336
- command: shell.exec
337337
type: setup
338338
params:
339+
continue_on_err: true
339340
shell: bash
340341
working_dir: src/github.com/mongodb/mongodb-kubernetes
341342
script: |
@@ -420,6 +421,7 @@ functions:
420421
upload_e2e_logs:
421422
- command: s3.put
422423
params:
424+
continue_on_err: true
423425
aws_key: ${enterprise_aws_access_key_id}
424426
aws_secret: ${enterprise_aws_secret_access_key}
425427
local_files_include_filter:

.evergreen-periodic-builds.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@ variables:
1919
- func: switch_context
2020

2121
tasks:
22+
- name: periodic_teardown_aws
23+
commands:
24+
- func: cleanup_aws
25+
26+
- name: periodic_teardown_cloudqa
27+
commands:
28+
- func: teardown_cloud_qa_all
29+
30+
task_groups:
2231
- name: periodic_teardown_task_group
2332
<<: *setup_group
2433
tasks:

.evergreen.yml

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ variables:
118118
- func: setup_cloud_qa
119119
teardown_task_can_fail_task: true
120120
teardown_task:
121+
- func: teardown_cloud_qa
121122
- func: upload_e2e_logs
122123
- func: teardown_kubernetes_environment
123-
- func: teardown_cloud_qa
124124

125125
- &setup_and_teardown_task
126126
setup_task_can_fail_task: true
@@ -157,6 +157,25 @@ variables:
157157
- name: build_agent_images_ubi
158158
variant: init_test_run
159159

160+
- &base_om7_dependency_with_race
161+
depends_on:
162+
- name: build_om_images
163+
variant: build_om70_images
164+
- name: build_operator_race_ubi
165+
variant: init_test_run
166+
- name: build_init_database_image_ubi
167+
variant: init_test_run
168+
- name: build_database_image_ubi
169+
variant: init_test_run
170+
- name: build_test_image
171+
variant: init_test_run
172+
- name: build_init_appdb_images_ubi
173+
variant: init_test_run
174+
- name: build_init_om_images_ubi
175+
variant: init_test_run
176+
- name: build_agent_images_ubi
177+
variant: init_test_run
178+
160179
- &base_om8_dependency
161180
depends_on:
162181
- name: build_om_images
@@ -376,6 +395,14 @@ tasks:
376395
vars:
377396
image_name: operator
378397

398+
- name: build_operator_race_ubi
399+
commands:
400+
- func: clone
401+
- func: setup_building_host
402+
- func: pipeline
403+
vars:
404+
image_name: operator-race
405+
379406
- name: build_init_om_images_ubi
380407
commands:
381408
- func: clone
@@ -1323,7 +1350,7 @@ buildvariants:
13231350
tags: [ "e2e_test_suite" ]
13241351
run_on:
13251352
- ubuntu1804-xlarge
1326-
<<: *base_om7_dependency
1353+
<<: *base_om7_dependency_with_race
13271354
tasks:
13281355
- name: e2e_operator_race_with_telemetry_task_group
13291356

@@ -1518,6 +1545,7 @@ buildvariants:
15181545
- ubuntu2204-small
15191546
tasks:
15201547
- name: build_operator_ubi
1548+
- name: build_operator_race_ubi
15211549
- name: build_test_image
15221550
- name: build_mco_test_image
15231551
- name: build_init_appdb_images_ubi

build_info.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,22 @@
2424
]
2525
}
2626
},
27+
"operator-race": {
28+
"dockerfile-path": "docker/mongodb-kubernetes-operator/Dockerfile.atomic",
29+
"patch": {
30+
"repository": "268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/mongodb-kubernetes",
31+
"platforms": [
32+
"linux/amd64"
33+
]
34+
},
35+
"staging": {
36+
"sign": true,
37+
"repository": "268558157000.dkr.ecr.us-east-1.amazonaws.com/staging/mongodb-kubernetes",
38+
"platforms": [
39+
"linux/amd64"
40+
]
41+
}
42+
},
2743
"init-database": {
2844
"dockerfile-path": "docker/mongodb-kubernetes-init-database/Dockerfile.atomic",
2945
"patch": {
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
title: Fixing auth transition edge-cases
3+
kind: fix
4+
date: 2025-08-08
5+
---
6+
7+
* Fixed an issue where the readiness probe reported the node as ready even when its authentication mechanism was not in sync with the other nodes, potentially causing premature restarts.

controllers/om/automation_status.go

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515
"github.com/mongodb/mongodb-kubernetes/pkg/util/stringutil"
1616
)
1717

18-
const automationAgentKubeUpgradePlan = "ChangeVersionKube"
18+
const automationAgentKubeUpgradeMove = "ChangeVersionKube"
1919

2020
// AutomationStatus represents the status of automation agents registered with Ops Manager
2121
type AutomationStatus struct {
@@ -85,12 +85,25 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
8585

8686
goalsNotAchievedMap := map[string]int{}
8787
goalsAchievedMap := map[string]int{}
88+
authTransitionsInProgress := map[string]string{}
89+
8890
for _, p := range as.Processes {
8991
if !stringutil.Contains(relevantProcesses, p.Name) {
9092
continue
9193
}
9294
if p.LastGoalVersionAchieved == as.GoalVersion {
9395
goalsAchievedMap[p.Name] = p.LastGoalVersionAchieved
96+
97+
// Check if authentication transitions are in the current plan.
98+
// If a process has reached goal version but still has auth-related moves in plan,
99+
// it means authentication transition is likely in progress.
100+
// The plan contains non-completed move names from the API.
101+
for _, move := range p.Plan {
102+
if isAuthenticationTransitionMove(move) {
103+
authTransitionsInProgress[p.Name] = move
104+
break
105+
}
106+
}
94107
} else {
95108
goalsNotAchievedMap[p.Name] = p.LastGoalVersionAchieved
96109
}
@@ -103,6 +116,18 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
103116
goalsAchievedMsgList := slices.Collect(maps.Keys(goalsAchievedMap))
104117
sort.Strings(goalsAchievedMsgList)
105118

119+
// Check if any authentication transitions are in progress
120+
if len(authTransitionsInProgress) > 0 {
121+
var authTransitionMsgList []string
122+
for processName, step := range authTransitionsInProgress {
123+
authTransitionMsgList = append(authTransitionMsgList, fmt.Sprintf("%s:%s", processName, step))
124+
}
125+
log.Infow("Authentication transitions still in progress, waiting for completion",
126+
"processes", authTransitionMsgList)
127+
return false, fmt.Sprintf("authentication transitions in progress for %d processes: %s",
128+
len(authTransitionsInProgress), authTransitionMsgList)
129+
}
130+
106131
if len(goalsNotAchievedMap) > 0 {
107132
return false, fmt.Sprintf("%d processes waiting to reach automation config goal state (version=%d): %s, %d processes reached goal state: %s",
108133
len(goalsNotAchievedMap), as.GoalVersion, goalsNotAchievedMsgList, len(goalsAchievedMsgList), goalsAchievedMsgList)
@@ -113,17 +138,29 @@ func checkAutomationStatusIsGoal(as *AutomationStatus, relevantProcesses []strin
113138
}
114139
}
115140

141+
// isAuthenticationTransitionMove reports whether move is one of the plan moves
// this package treats as part of an authentication transition, i.e. exactly
// "UpdateAuth" or "WaitAuthUpdate". The match is exact and case-sensitive;
// any other value (including the empty string) yields false.
func isAuthenticationTransitionMove(move string) bool {
	switch move {
	case "UpdateAuth", "WaitAuthUpdate":
		return true
	default:
		return false
	}
}
152+
116153
func areAnyAgentsInKubeUpgradeMode(as *AutomationStatus, relevantProcesses []string, log *zap.SugaredLogger) bool {
117154
for _, p := range as.Processes {
118155
if !stringutil.Contains(relevantProcesses, p.Name) {
119156
continue
120157
}
121-
for _, plan := range p.Plan {
158+
for _, move := range p.Plan {
122159
// This means the following:
123160
// - the cluster is in static architecture
124161
// - the agents are in a dedicated upgrade process, waiting for their binaries to be replaced by kubernetes
125162
// - this can only happen if the statefulset is ready, therefore we are returning ready here
126-
if plan == automationAgentKubeUpgradePlan {
163+
if move == automationAgentKubeUpgradeMove {
127164
log.Debug("cluster is in changeVersionKube mode, returning the agent is ready.")
128165
return true
129166
}

controllers/om/automation_status_test.go

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ func TestCheckAutomationStatusIsGoal(t *testing.T) {
7575
},
7676
{
7777
Name: "b",
78-
Plan: []string{"FCV", automationAgentKubeUpgradePlan},
78+
Plan: []string{"FCV", automationAgentKubeUpgradeMove},
7979
LastGoalVersionAchieved: 1,
8080
},
8181
},
@@ -119,3 +119,132 @@ func TestCheckAutomationStatusIsGoal(t *testing.T) {
119119
})
120120
}
121121
}
122+
123+
func TestCheckAutomationStatusIsGoal_AuthenticationTransitions(t *testing.T) {
124+
logger := zap.NewNop().Sugar()
125+
126+
tests := []struct {
127+
name string
128+
automationStatus *AutomationStatus
129+
relevantProcesses []string
130+
expectedReady bool
131+
expectedMessage string
132+
}{
133+
{
134+
name: "should wait for UpdateAuth move to complete",
135+
automationStatus: &AutomationStatus{
136+
GoalVersion: 5,
137+
Processes: []ProcessStatus{
138+
{
139+
Name: "rs0_0",
140+
LastGoalVersionAchieved: 5,
141+
Plan: []string{"UpdateAuth"},
142+
},
143+
},
144+
},
145+
relevantProcesses: []string{"rs0_0"},
146+
expectedReady: false,
147+
expectedMessage: "authentication transitions in progress for 1 processes",
148+
},
149+
{
150+
name: "should be ready when authentication transitions are complete",
151+
automationStatus: &AutomationStatus{
152+
GoalVersion: 5,
153+
Processes: []ProcessStatus{
154+
{
155+
Name: "rs0_0",
156+
LastGoalVersionAchieved: 5,
157+
Plan: []string{}, // Empty plan means all moves completed
158+
},
159+
},
160+
},
161+
relevantProcesses: []string{"rs0_0"},
162+
expectedReady: true,
163+
expectedMessage: "processes that reached goal state: [rs0_0]",
164+
},
165+
{
166+
name: "should wait for multiple processes with auth transitions",
167+
automationStatus: &AutomationStatus{
168+
GoalVersion: 7,
169+
Processes: []ProcessStatus{
170+
{
171+
Name: "rs0_0",
172+
LastGoalVersionAchieved: 7,
173+
Plan: []string{}, // This process completed
174+
},
175+
{
176+
Name: "rs0_1",
177+
LastGoalVersionAchieved: 7,
178+
Plan: []string{"WaitAuthUpdate"}, // Auth-related move in progress
179+
},
180+
},
181+
},
182+
relevantProcesses: []string{"rs0_0", "rs0_1"},
183+
expectedReady: false,
184+
expectedMessage: "authentication transitions in progress for 1 processes",
185+
},
186+
{
187+
name: "should ignore non-authentication moves in progress",
188+
automationStatus: &AutomationStatus{
189+
GoalVersion: 4,
190+
Processes: []ProcessStatus{
191+
{
192+
Name: "rs0_0",
193+
LastGoalVersionAchieved: 4,
194+
Plan: []string{"SomeOtherMove"}, // Non-auth move
195+
},
196+
},
197+
},
198+
relevantProcesses: []string{"rs0_0"},
199+
expectedReady: true,
200+
expectedMessage: "processes that reached goal state: [rs0_0]",
201+
},
202+
}
203+
204+
for _, tt := range tests {
205+
t.Run(tt.name, func(t *testing.T) {
206+
ready, message := checkAutomationStatusIsGoal(
207+
tt.automationStatus,
208+
tt.relevantProcesses,
209+
logger,
210+
)
211+
212+
assert.Equal(t, tt.expectedReady, ready, "Ready state should match expected")
213+
assert.Contains(t, message, tt.expectedMessage, "Message should contain expected text")
214+
215+
if tt.expectedReady {
216+
t.Logf("✅ Process correctly marked as ready: %s", message)
217+
} else {
218+
t.Logf("⏳ Process correctly waiting for auth transition: %s", message)
219+
}
220+
})
221+
}
222+
}
223+
224+
func TestIsAuthenticationTransitionMove(t *testing.T) {
225+
authMoves := []string{
226+
"UpdateAuth",
227+
"WaitAuthUpdate",
228+
}
229+
230+
nonAuthMoves := []string{
231+
"SomeOtherMove",
232+
"CreateIndex",
233+
"DropCollection",
234+
"BackupDatabase",
235+
}
236+
237+
for _, move := range authMoves {
238+
t.Run("auth_move_"+move, func(t *testing.T) {
239+
assert.True(t, isAuthenticationTransitionMove(move),
240+
"Move %s should be recognized as authentication transition", move)
241+
})
242+
}
243+
244+
for _, move := range nonAuthMoves {
245+
t.Run("non_auth_move_"+move, func(t *testing.T) {
246+
assert.False(t, isAuthenticationTransitionMove(move),
247+
"Move %s should not be recognized as authentication transition", move)
248+
})
249+
}
250+
}

scripts/dev/contexts/evg-private-context

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,8 @@ STAGING_REPO_URL="268558157000.dkr.ecr.us-east-1.amazonaws.com/staging"
3838
COMMIT_SHA_SHORT=$(git rev-parse --short=8 HEAD)
3939

4040
if [ "${is_patch:-false}" = "true" ]; then
41-
echo "is_patch is set, setting BASE_REPO_URL=${STAGING_REPO_URL}"
42-
export BASE_REPO_URL="${STAGING_REPO_URL}"
43-
export OVERRIDE_VERSION_ID="${COMMIT_SHA_SHORT}"
41+
echo "is_patch is set, setting BASE_REPO_URL=${DEV_REPO_URL}"
42+
export BASE_REPO_URL="${DEV_REPO_URL}"
4443
else
4544
echo "is_patch is not set, setting BASE_REPO_URL=${STAGING_REPO_URL}, OVERRIDE_VERSION_ID=${COMMIT_SHA_SHORT}"
4645
export BASE_REPO_URL="${STAGING_REPO_URL}"

scripts/release/atomic_pipeline.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,17 +119,20 @@ def build_mco_tests_image(build_configuration: ImageBuildConfiguration):
119119
)
120120

121121

122-
def build_operator_image(build_configuration: ImageBuildConfiguration):
122+
def build_operator_image(build_configuration: ImageBuildConfiguration, with_race_detection: bool = False):
123123
"""Calculates arguments required to build the operator image, and starts the build process."""
124124
# In evergreen, we can pass test_suffix env to publish the operator to a quay
125125
# repository with a given suffix.
126126
test_suffix = os.getenv("test_suffix", "")
127127
log_automation_config_diff = os.getenv("LOG_AUTOMATION_CONFIG_DIFF", "false")
128128

129+
build_configuration.version = f"{build_configuration.version}{'-race' if with_race_detection else ''}"
130+
129131
args = {
130132
"version": build_configuration.version,
131133
"log_automation_config_diff": log_automation_config_diff,
132134
"test_suffix": test_suffix,
135+
"use_race": "true" if with_race_detection else "false",
133136
}
134137

135138
logger.info(f"Building Operator args: {args}")

0 commit comments

Comments
 (0)