
Commit 6b4107d

CLOUDP-316922 - Fix racy and slow auth tests like in openshift clusters (#384)
# Summary

Fixes:

* e2e_sharded_cluster_scram_sha_256_user_connectivity
* e2e_replica_set_scram_sha_256_user_connectivity

Both were mostly failing on master merges on OpenShift. Proof of work (below) shows 3x passing in a row for each one.

**Reliability improvements for authentication tests:**

* Added a `_wait_for_mongodbuser_reconciliation` function to ensure all `MongoDBUser` resources reach the "Updated" phase before authentication attempts, preventing race conditions after user/password changes. This function is now called at the start of all authentication assertion methods.
* Increased the default number of authentication attempts to 50 across all relevant methods.
* Some of the tests and their logs showed that we update the secret and the reconcile is sometimes slow to pick it up. Meanwhile, we have already started the verification, whose default budget is around 100s. That is too short and racy. In this [log](https://operator-e2e-artifacts.s3.us-east-1.amazonaws.com/logs/mongodb_kubernetes_e2e_openshift_static_mdb_ubi_cloudqa_e2e_sharded_cluster_scram_sha_256_user_connectivity_3f4cd2b7bb80a72c09cb9871516d0ca6e0392efe_25_08_28_08_19_54/0/mongodb-kubernetes-operator-8844fcfbf-pfm4t-mongodb-kubernetes-operator.log?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Checksum-Mode=ENABLED&X-Amz-Credential=AKIAT5B2QITEKCB6K6KZ%2F20250828%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250828T145851Z&X-Amz-Expires=900&X-Amz-SignedHeaders=host&x-id=GetObject&X-Amz-Signature=6ff83a8aeb9846626203f3bac03fc3ba08077bc1887f56bd8cd7a7a8535cbb8d) one can see that the reconciliation of the user took around 1m, but the auth verification had already started before that, pushing the total run time past the 100s timeout (a minimal sketch of this race follows the summary).

**Error handling and diagnostics:**

* Fixed a logging error: we were passing `msg=` to `fail`, but that keyword doesn't exist, so we caused a panic when a test was failing, masking the real error.
* Enhanced diagnostics collection in `tests/conftest.py` to also save the automation config JSON for each project when tests fail, aiding post-mortem analysis.

## Proof of Work

- green CI
- green [openshift](https://spruce.mongodb.com/version/68b086ba8764840007614ae0/tasks?sorts=STATUS%3AASC%3BBASE_STATUS%3ADESC) tests (passed multiple times in a row)

## Checklist

- [x] Have you linked a jira ticket and/or is the ticket in the title?
- [x] Have you checked whether your jira ticket required DOCSP changes?
- [x] Have you added a changelog file?
  - use the `skip-changelog` label if not needed
  - refer to the [Changelog files and Release Notes](https://github.com/mongodb/mongodb-kubernetes/blob/master/CONTRIBUTING.md#changelog-files-and-release-notes) section in CONTRIBUTING.md for more details
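A minimal sketch of the race this fixes, for illustration only: the fixture names, the secret name, and the secret layout below are assumptions, not the suite's exact fixtures.

```python
# Hypothetical test flow (names are illustrative). After a password secret is
# rotated, the operator must reconcile the MongoDBUser before the new
# credentials actually authenticate.
from kubetester import create_or_update_secret
from kubetester.mongotester import MongoTester


def test_auth_after_password_change(namespace: str, tester: MongoTester):
    # 1. Rotate the user's password secret.
    create_or_update_secret(namespace, "mms-user-1-password", {"password": "new-password"})

    # 2. Previously, verification started immediately with a ~100s budget
    #    (20 attempts x 5s sleep), while the reconcile could take ~1m (see the
    #    linked log). Now the assertion first waits for every MongoDBUser to
    #    reach the Updated phase, then retries up to 50 times.
    tester.assert_scram_sha_authentication(
        username="mms-user-1",
        password="new-password",
        auth_mechanism="SCRAM-SHA-256",
    )
```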
1 parent dd5f1d8 commit 6b4107d

6 files changed: +118 −17 lines

.evergreen-functions.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -456,6 +456,7 @@ functions:
         content_type: text/plain
     - command: attach.xunit_results
       params:
+        continue_on_err: true
         file: "src/github.com/mongodb/mongodb-kubernetes/logs/myreport.xml"
 
   upload_e2e_logs_gotest:
```

docker/mongodb-kubernetes-tests/kubetester/mongodb_utils_state.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -5,8 +5,10 @@
 
 from kubetester.phase import Phase
 from opentelemetry import trace
+from tests import test_logger
 
 TRACER = trace.get_tracer("evergreen-agent")
+logger = test_logger.get_test_logger(__name__)
 
 
 @TRACER.start_as_current_span("in_desired_state")
@@ -34,6 +36,9 @@ def in_desired_state(
     for event in intermediate_events:
         if event in current_message:
             found = True
+            logger.debug(
+                f"Found intermediate event in failure: {event} in {current_message}. Skipping the failure state"
+            )
 
     if not found:
         raise AssertionError(f'Got into Failed phase while waiting for Running! ("{current_message}")')
```

docker/mongodb-kubernetes-tests/kubetester/mongotester.py

Lines changed: 87 additions & 13 deletions

```diff
@@ -11,6 +11,7 @@
 import pymongo
 from kubetester import kubetester
 from kubetester.kubetester import KubernetesTester
+from kubetester.phase import Phase
 from opentelemetry import trace
 from pycognito import Cognito
 from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult
@@ -76,6 +77,63 @@ def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult:
         return OIDCCallbackResult(access_token=u.id_token)
 
 
+def _wait_for_mongodbuser_reconciliation() -> None:
+    """
+    Wait for ALL MongoDBUser resources in the namespace to be reconciled before attempting authentication.
+    This prevents race conditions when passwords or user configurations have been recently changed.
+
+    Lists all MongoDBUser resources in the namespace and waits for ALL of them to reach Updated phase.
+    """
+    try:
+        # Import inside function to avoid circular imports
+        import kubernetes.client as client
+        from kubetester.mongodb_user import MongoDBUser
+        from tests.conftest import get_central_cluster_client
+
+        namespace = KubernetesTester.get_namespace()
+        api_client = client.CustomObjectsApi(api_client=get_central_cluster_client())
+
+        try:
+            mongodb_users = api_client.list_namespaced_custom_object(
+                group="mongodb.com", version="v1", namespace=namespace, plural="mongodbusers"
+            )
+
+            all_users = []
+
+            for user_item in mongodb_users.get("items", []):
+                user_name = user_item.get("metadata", {}).get("name", "unknown")
+                username = user_item.get("spec", {}).get("username", "unknown")
+                all_users.append((user_name, username))
+
+            if not all_users:
+                return
+
+            logging.info(
+                f"Found {len(all_users)} MongoDBUser resource(s) in namespace '{namespace}', waiting for all to reach Updated phase..."
+            )
+
+            for user_name, username in all_users:
+                try:
+                    logging.info(
+                        f"Waiting for MongoDBUser '{user_name}' (username: {username}) to reach Updated phase..."
+                    )
+
+                    user = MongoDBUser(name=user_name, namespace=namespace)
+                    user.assert_reaches_phase(Phase.Updated, timeout=300)
+                    logging.info(f"MongoDBUser '{user_name}' reached Updated phase - reconciliation complete")
+
+                except Exception as e:
+                    logging.warning(f"Failed to wait for MongoDBUser '{user_name}' reconciliation: {e}")
+                    # Continue with other users - don't fail the entire test
+
+            logging.info("All MongoDBUser resources reconciliation check complete")
+
+        except Exception as e:
+            logging.warning(f"Failed to list MongoDBUser resources: {e} - proceeding without reconciliation wait")
+    except Exception as e:
+        logging.warning(f"Error while waiting for MongoDBUser reconciliation: {e} - proceeding with authentication")
+
+
 class MongoTester:
     """MongoTester is a general abstraction to work with mongo database. It encapsulates the client created in
     the constructor. All general methods non-specific to types of mongodb topologies should reside here."""
@@ -115,7 +173,7 @@ def _init_client(self, **kwargs):
 
     def assert_connectivity(
         self,
-        attempts: int = 20,
+        attempts: int = 50,
         db: str = "admin",
         col: str = "myCol",
         opts: Optional[List[Dict[str, any]]] = None,
@@ -175,13 +233,17 @@ def assert_scram_sha_authentication(
         username: str,
         password: str,
         auth_mechanism: str,
-        attempts: int = 20,
+        attempts: int = 50,
         ssl: bool = False,
         **kwargs,
     ) -> None:
         assert attempts > 0
         assert auth_mechanism in {"SCRAM-SHA-256", "SCRAM-SHA-1"}
 
+        # Wait for ALL MongoDBUser resources to be reconciled before attempting authentication
+        # This prevents race conditions when passwords have been recently changed
+        _wait_for_mongodbuser_reconciliation()
+
         for i in reversed(range(attempts)):
             try:
                 self._authenticate_with_scram(
@@ -194,14 +256,15 @@ def assert_scram_sha_authentication(
                 return
             except OperationFailure as e:
                 if i == 0:
-                    fail(msg=f"unable to authenticate after {attempts} attempts with error: {e}")
+                    fail(f"unable to authenticate after {attempts} attempts with error: {e}")
+
                 time.sleep(5)
 
     def assert_scram_sha_authentication_fails(
         self,
         username: str,
         password: str,
-        retries: int = 20,
+        attempts: int = 50,
         ssl: bool = False,
         **kwargs,
     ):
@@ -211,13 +274,16 @@ def assert_scram_sha_authentication_fails(
         which still exists. When we change a password, we should eventually no longer be able to auth with
         that user's credentials.
         """
-        for i in range(retries):
+
+        _wait_for_mongodbuser_reconciliation()
+
+        for i in range(attempts):
             try:
                 self._authenticate_with_scram(username, password, ssl=ssl, **kwargs)
             except OperationFailure:
                 return
             time.sleep(5)
-        fail(f"was still able to authenticate with username={username} password={password} after {retries} attempts")
+        fail(f"was still able to authenticate with username={username} password={password} after {attempts} attempts")
 
     def _authenticate_with_scram(
         self,
@@ -239,9 +305,11 @@ def _authenticate_with_scram(
         # authentication doesn't actually happen until we interact with a database
         self.client["admin"]["myCol"].insert_one({})
 
-    def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, **kwargs):
+    def assert_x509_authentication(self, cert_file_name: str, attempts: int = 50, **kwargs):
         assert attempts > 0
 
+        _wait_for_mongodbuser_reconciliation()
+
         options = self._merge_options(
             [
                 with_x509(cert_file_name, kwargs.get("tlsCAFile", kubetester.SSL_CA_CERT)),
@@ -257,7 +325,8 @@ def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, **
                 return
             except OperationFailure:
                 if attempts == 0:
-                    fail(msg=f"unable to authenticate after {total_attempts} attempts")
+                    fail(f"unable to authenticate after {total_attempts} attempts")
+
                 time.sleep(5)
 
     def assert_ldap_authentication(
@@ -268,8 +337,9 @@ def assert_ldap_authentication(
         collection: str = "myCol",
         tls_ca_file: Optional[str] = None,
         ssl_certfile: str = None,
-        attempts: int = 20,
+        attempts: int = 50,
     ):
+        _wait_for_mongodbuser_reconciliation()
 
         options = with_ldap(ssl_certfile, tls_ca_file)
         total_attempts = attempts
@@ -289,17 +359,20 @@ def assert_ldap_authentication(
                 return
             except OperationFailure:
                 if attempts <= 0:
-                    fail(msg=f"unable to authenticate after {total_attempts} attempts")
+                    fail(f"unable to authenticate after {total_attempts} attempts")
+
                 time.sleep(5)
 
     def assert_oidc_authentication(
         self,
         db: str = "admin",
         collection: str = "myCol",
-        attempts: int = 10,
+        attempts: int = 50,
     ):
         assert attempts > 0
 
+        _wait_for_mongodbuser_reconciliation()
+
         props = {"OIDC_CALLBACK": MyOIDCCallback()}
 
         total_attempts = attempts
@@ -317,6 +390,7 @@
             except OperationFailure as e:
                 if attempts == 0:
                     raise RuntimeError(f"Unable to authenticate after {total_attempts} attempts: {e}")
+
                 time.sleep(5)
 
     def assert_oidc_authentication_fails(self, db: str = "admin", collection: str = "myCol", attempts: int = 10):
@@ -326,7 +400,7 @@ def assert_oidc_authentication_fails(self, db: str = "admin", collection: str =
             attempts -= 1
             try:
                 if attempts <= 0:
-                    fail(msg=f"was able to authenticate with OIDC after {total_attempts} attempts")
+                    fail(f"was able to authenticate with OIDC after {total_attempts} attempts")
 
                 self.assert_oidc_authentication(db, collection, 1)
                 time.sleep(5)
@@ -362,7 +436,7 @@ def assert_deployment_reachable(self, attempts: int = 10):
             if hosts_unreachable == 0:
                 return
             if attempts <= 0:
-                fail(msg="Some hosts still report NO_DATA state")
+                fail("Some hosts still report NO_DATA state")
             time.sleep(10)
```

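For context on the numbers in the diff above: the old default of 20 attempts with a 5-second sleep gave a retry budget of roughly 20 × 5s = 100s, which is exactly the racy window called out in the summary. The new default of 50 attempts stretches that to about 250s, on top of the up-to-300s-per-user wait that `_wait_for_mongodbuser_reconciliation` performs before the first attempt.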
docker/mongodb-kubernetes-tests/kubetester/opsmanager.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -4,7 +4,7 @@
 import re
 import time
 from base64 import b64decode
-from typing import Any, Callable, Dict, List, Optional
+from typing import Callable, Dict, List, Optional
 
 import kubernetes.client
 import requests
@@ -13,7 +13,6 @@
 from kubetester import (
     create_configmap,
     create_or_update_secret,
-    read_configmap,
     read_secret,
 )
 from kubetester.automation_config_tester import AutomationConfigTester
@@ -1029,6 +1028,8 @@ def assert_reaches_phase(
             # This can be an intermediate error, right before we check for this secret we create it.
             # The cluster might just be slow
             "failed to locate the api key secret",
+            # etcd might be slow
+            "etcdserver: request timed out",
         )
 
         start_time = time.time()
```
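The new tuple entry feeds the substring check in `in_desired_state` (see the mongodb_utils_state.py diff above). A simplified sketch of that classification, under the assumption that plain substring matching is all that happens:

```python
# Simplified sketch of how a Failed status message is classified as transient.
INTERMEDIATE_EVENTS = (
    "failed to locate the api key secret",  # secret is created right before the check
    "etcdserver: request timed out",        # etcd can be slow on loaded CI clusters
)


def is_transient_failure(current_message: str) -> bool:
    # Any known transient message means "keep waiting" rather than raising
    # "Got into Failed phase while waiting for Running!".
    return any(event in current_message for event in INTERMEDIATE_EVENTS)
```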

docker/mongodb-kubernetes-tests/tests/authentication/sharded_cluster_x509_to_scram_transition.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -106,7 +106,7 @@ class TestCanEnableScramSha256:
     @TRACER.start_as_current_span("test_can_enable_scram_sha_256")
     def test_can_enable_scram_sha_256(self, sharded_cluster: MongoDB, ca_path: str):
         kubetester.wait_processes_ready()
-        sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
+        sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1400)
 
         sharded_cluster.load()
         sharded_cluster["spec"]["security"]["authentication"]["enabled"] = True
@@ -115,7 +115,7 @@ def test_can_enable_scram_sha_256(self, sharded_cluster: MongoDB, ca_path: str):
         ]
         sharded_cluster["spec"]["security"]["authentication"]["agents"]["mode"] = "SCRAM"
         sharded_cluster.update()
-        sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
+        sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1400)
 
     def test_assert_connectivity(self, ca_path: str):
         ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity(attempts=25)
```

docker/mongodb-kubernetes-tests/tests/conftest.py

Lines changed: 20 additions & 0 deletions

```diff
@@ -1684,6 +1684,26 @@ def pytest_sessionfinish(session, exitstatus):
             ev = tester.get_project_events().json()["results"]
             with open(f"/tmp/diagnostics/{project_id}-events.json", "w", encoding="utf-8") as f:
                 json.dump(ev, f, ensure_ascii=False, indent=4)
+
+            if exitstatus != 0:
+                try:
+                    automation_config_tester = tester.get_automation_config_tester()
+                    automation_config = automation_config_tester.automation_config
+                    if not automation_config:
+                        continue
+
+                    # Remove mongoDbVersions field as it's too large
+                    if "mongoDbVersions" in automation_config:
+                        del automation_config["mongoDbVersions"]
+
+                    with open(
+                        f"/tmp/diagnostics/{project_id}-automation-config.json", "w", encoding="utf-8"
+                    ) as f:
+                        json.dump(automation_config, f, ensure_ascii=False, indent=4)
+
+                    logging.info(f"Saved automation config for project {project_id}")
+                except Exception as e:
+                    logging.warning(f"Failed to collect automation config for project {project_id}: {e}")
         else:
             logging.info("om is not healthy - not collecting events information")
```

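As a post-mortem aid, the saved file can be inspected directly. A quick example, where the project id is a placeholder and `auth.deploymentAuthMechanisms` is just one field one might check, depending on what the automation config contains:

```python
import json

# Placeholder project id; the file sits next to the existing <project_id>-events.json
with open("/tmp/diagnostics/my-project-id-automation-config.json", encoding="utf-8") as f:
    automation_config = json.load(f)

# e.g. see which auth mechanisms the agents were configured with when the test failed
print(automation_config.get("auth", {}).get("deploymentAuthMechanisms"))
```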