diff --git a/.evergreen-functions.yml b/.evergreen-functions.yml index 90520daa4..5c7636210 100644 --- a/.evergreen-functions.yml +++ b/.evergreen-functions.yml @@ -456,6 +456,7 @@ functions: content_type: text/plain - command: attach.xunit_results params: + continue_on_err: true file: "src/github.com/mongodb/mongodb-kubernetes/logs/myreport.xml" upload_e2e_logs_gotest: diff --git a/docker/mongodb-kubernetes-tests/kubetester/mongodb_utils_state.py b/docker/mongodb-kubernetes-tests/kubetester/mongodb_utils_state.py index 5ee557445..5dee14799 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/mongodb_utils_state.py +++ b/docker/mongodb-kubernetes-tests/kubetester/mongodb_utils_state.py @@ -5,8 +5,10 @@ from kubetester.phase import Phase from opentelemetry import trace +from tests import test_logger TRACER = trace.get_tracer("evergreen-agent") +logger = test_logger.get_test_logger(__name__) @TRACER.start_as_current_span("in_desired_state") @@ -34,6 +36,9 @@ def in_desired_state( for event in intermediate_events: if event in current_message: found = True + logger.debug( + f"Found intermediate event in failure: {event} in {current_message}. Skipping the failure state" + ) if not found: raise AssertionError(f'Got into Failed phase while waiting for Running! ("{current_message}")') diff --git a/docker/mongodb-kubernetes-tests/kubetester/mongotester.py b/docker/mongodb-kubernetes-tests/kubetester/mongotester.py index 86ac0f75f..23f0d083a 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/mongotester.py +++ b/docker/mongodb-kubernetes-tests/kubetester/mongotester.py @@ -11,6 +11,7 @@ import pymongo from kubetester import kubetester from kubetester.kubetester import KubernetesTester +from kubetester.phase import Phase from opentelemetry import trace from pycognito import Cognito from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult @@ -76,6 +77,63 @@ def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: return OIDCCallbackResult(access_token=u.id_token) +def _wait_for_mongodbuser_reconciliation() -> None: + """ + Wait for ALL MongoDBUser resources in the namespace to be reconciled before attempting authentication. + This prevents race conditions when passwords or user configurations have been recently changed. + + Lists all MongoDBUser resources in the namespace and waits for ALL of them to reach Updated phase. + """ + try: + # Import inside function to avoid circular imports + import kubernetes.client as client + from kubetester.mongodb_user import MongoDBUser + from tests.conftest import get_central_cluster_client + + namespace = KubernetesTester.get_namespace() + api_client = client.CustomObjectsApi(api_client=get_central_cluster_client()) + + try: + mongodb_users = api_client.list_namespaced_custom_object( + group="mongodb.com", version="v1", namespace=namespace, plural="mongodbusers" + ) + + all_users = [] + + for user_item in mongodb_users.get("items", []): + user_name = user_item.get("metadata", {}).get("name", "unknown") + username = user_item.get("spec", {}).get("username", "unknown") + all_users.append((user_name, username)) + + if not all_users: + return + + logging.info( + f"Found {len(all_users)} MongoDBUser resource(s) in namespace '{namespace}', waiting for all to reach Updated phase..." + ) + + for user_name, username in all_users: + try: + logging.info( + f"Waiting for MongoDBUser '{user_name}' (username: {username}) to reach Updated phase..." + ) + + user = MongoDBUser(name=user_name, namespace=namespace) + user.assert_reaches_phase(Phase.Updated, timeout=300) + logging.info(f"MongoDBUser '{user_name}' reached Updated phase - reconciliation complete") + + except Exception as e: + logging.warning(f"Failed to wait for MongoDBUser '{user_name}' reconciliation: {e}") + # Continue with other users - don't fail the entire test + + logging.info("All MongoDBUser resources reconciliation check complete") + + except Exception as e: + logging.warning(f"Failed to list MongoDBUser resources: {e} - proceeding without reconciliation wait") + except Exception as e: + logging.warning(f"Error while waiting for MongoDBUser reconciliation: {e} - proceeding with authentication") + + class MongoTester: """MongoTester is a general abstraction to work with mongo database. It encapsulates the client created in the constructor. All general methods non-specific to types of mongodb topologies should reside here.""" @@ -115,7 +173,7 @@ def _init_client(self, **kwargs): def assert_connectivity( self, - attempts: int = 20, + attempts: int = 50, db: str = "admin", col: str = "myCol", opts: Optional[List[Dict[str, any]]] = None, @@ -175,13 +233,17 @@ def assert_scram_sha_authentication( username: str, password: str, auth_mechanism: str, - attempts: int = 20, + attempts: int = 50, ssl: bool = False, **kwargs, ) -> None: assert attempts > 0 assert auth_mechanism in {"SCRAM-SHA-256", "SCRAM-SHA-1"} + # Wait for ALL MongoDBUser resources to be reconciled before attempting authentication + # This prevents race conditions when passwords have been recently changed + _wait_for_mongodbuser_reconciliation() + for i in reversed(range(attempts)): try: self._authenticate_with_scram( @@ -194,14 +256,15 @@ def assert_scram_sha_authentication( return except OperationFailure as e: if i == 0: - fail(msg=f"unable to authenticate after {attempts} attempts with error: {e}") + fail(f"unable to authenticate after {attempts} attempts with error: {e}") + time.sleep(5) def assert_scram_sha_authentication_fails( self, username: str, password: str, - retries: int = 20, + attempts: int = 50, ssl: bool = False, **kwargs, ): @@ -211,13 +274,16 @@ def assert_scram_sha_authentication_fails( which still exists. When we change a password, we should eventually no longer be able to auth with that user's credentials. """ - for i in range(retries): + + _wait_for_mongodbuser_reconciliation() + + for i in range(attempts): try: self._authenticate_with_scram(username, password, ssl=ssl, **kwargs) except OperationFailure: return time.sleep(5) - fail(f"was still able to authenticate with username={username} password={password} after {retries} attempts") + fail(f"was still able to authenticate with username={username} password={password} after {attempts} attempts") def _authenticate_with_scram( self, @@ -239,9 +305,11 @@ def _authenticate_with_scram( # authentication doesn't actually happen until we interact with a database self.client["admin"]["myCol"].insert_one({}) - def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, **kwargs): + def assert_x509_authentication(self, cert_file_name: str, attempts: int = 50, **kwargs): assert attempts > 0 + _wait_for_mongodbuser_reconciliation() + options = self._merge_options( [ with_x509(cert_file_name, kwargs.get("tlsCAFile", kubetester.SSL_CA_CERT)), @@ -257,7 +325,8 @@ def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, ** return except OperationFailure: if attempts == 0: - fail(msg=f"unable to authenticate after {total_attempts} attempts") + fail(f"unable to authenticate after {total_attempts} attempts") + time.sleep(5) def assert_ldap_authentication( @@ -268,8 +337,9 @@ def assert_ldap_authentication( collection: str = "myCol", tls_ca_file: Optional[str] = None, ssl_certfile: str = None, - attempts: int = 20, + attempts: int = 50, ): + _wait_for_mongodbuser_reconciliation() options = with_ldap(ssl_certfile, tls_ca_file) total_attempts = attempts @@ -289,17 +359,20 @@ def assert_ldap_authentication( return except OperationFailure: if attempts <= 0: - fail(msg=f"unable to authenticate after {total_attempts} attempts") + fail(f"unable to authenticate after {total_attempts} attempts") + time.sleep(5) def assert_oidc_authentication( self, db: str = "admin", collection: str = "myCol", - attempts: int = 10, + attempts: int = 50, ): assert attempts > 0 + _wait_for_mongodbuser_reconciliation() + props = {"OIDC_CALLBACK": MyOIDCCallback()} total_attempts = attempts @@ -317,6 +390,7 @@ def assert_oidc_authentication( except OperationFailure as e: if attempts == 0: raise RuntimeError(f"Unable to authenticate after {total_attempts} attempts: {e}") + time.sleep(5) def assert_oidc_authentication_fails(self, db: str = "admin", collection: str = "myCol", attempts: int = 10): @@ -326,7 +400,7 @@ def assert_oidc_authentication_fails(self, db: str = "admin", collection: str = attempts -= 1 try: if attempts <= 0: - fail(msg=f"was able to authenticate with OIDC after {total_attempts} attempts") + fail(f"was able to authenticate with OIDC after {total_attempts} attempts") self.assert_oidc_authentication(db, collection, 1) time.sleep(5) @@ -362,7 +436,7 @@ def assert_deployment_reachable(self, attempts: int = 10): if hosts_unreachable == 0: return if attempts <= 0: - fail(msg="Some hosts still report NO_DATA state") + fail("Some hosts still report NO_DATA state") time.sleep(10) diff --git a/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py b/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py index e413ddeee..a929c22ab 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py +++ b/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py @@ -4,7 +4,7 @@ import re import time from base64 import b64decode -from typing import Any, Callable, Dict, List, Optional +from typing import Callable, Dict, List, Optional import kubernetes.client import requests @@ -13,7 +13,6 @@ from kubetester import ( create_configmap, create_or_update_secret, - read_configmap, read_secret, ) from kubetester.automation_config_tester import AutomationConfigTester @@ -1029,6 +1028,8 @@ def assert_reaches_phase( # This can be an intermediate error, right before we check for this secret we create it. # The cluster might just be slow "failed to locate the api key secret", + # etcd might be slow + "etcdserver: request timed out", ) start_time = time.time() diff --git a/docker/mongodb-kubernetes-tests/tests/authentication/sharded_cluster_x509_to_scram_transition.py b/docker/mongodb-kubernetes-tests/tests/authentication/sharded_cluster_x509_to_scram_transition.py index c6298949f..45f7bfb88 100644 --- a/docker/mongodb-kubernetes-tests/tests/authentication/sharded_cluster_x509_to_scram_transition.py +++ b/docker/mongodb-kubernetes-tests/tests/authentication/sharded_cluster_x509_to_scram_transition.py @@ -106,7 +106,7 @@ class TestCanEnableScramSha256: @TRACER.start_as_current_span("test_can_enable_scram_sha_256") def test_can_enable_scram_sha_256(self, sharded_cluster: MongoDB, ca_path: str): kubetester.wait_processes_ready() - sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800) + sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1400) sharded_cluster.load() sharded_cluster["spec"]["security"]["authentication"]["enabled"] = True @@ -115,7 +115,7 @@ def test_can_enable_scram_sha_256(self, sharded_cluster: MongoDB, ca_path: str): ] sharded_cluster["spec"]["security"]["authentication"]["agents"]["mode"] = "SCRAM" sharded_cluster.update() - sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800) + sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1400) def test_assert_connectivity(self, ca_path: str): ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity(attempts=25) diff --git a/docker/mongodb-kubernetes-tests/tests/conftest.py b/docker/mongodb-kubernetes-tests/tests/conftest.py index 4a4c38e11..15366fc79 100644 --- a/docker/mongodb-kubernetes-tests/tests/conftest.py +++ b/docker/mongodb-kubernetes-tests/tests/conftest.py @@ -1684,6 +1684,26 @@ def pytest_sessionfinish(session, exitstatus): ev = tester.get_project_events().json()["results"] with open(f"/tmp/diagnostics/{project_id}-events.json", "w", encoding="utf-8") as f: json.dump(ev, f, ensure_ascii=False, indent=4) + + if exitstatus != 0: + try: + automation_config_tester = tester.get_automation_config_tester() + automation_config = automation_config_tester.automation_config + if not automation_config: + continue + + # Remove mongoDbVersions field as it's too large + if "mongoDbVersions" in automation_config: + del automation_config["mongoDbVersions"] + + with open( + f"/tmp/diagnostics/{project_id}-automation-config.json", "w", encoding="utf-8" + ) as f: + json.dump(automation_config, f, ensure_ascii=False, indent=4) + + logging.info(f"Saved automation config for project {project_id}") + except Exception as e: + logging.warning(f"Failed to collect automation config for project {project_id}: {e}") else: logging.info("om is not healthy - not collecting events information")