Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .evergreen-functions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ functions:
content_type: text/plain
- command: attach.xunit_results
params:
continue_on_err: true
file: "src/github.com/mongodb/mongodb-kubernetes/logs/myreport.xml"

upload_e2e_logs_gotest:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

from kubetester.phase import Phase
from opentelemetry import trace
from tests import test_logger

TRACER = trace.get_tracer("evergreen-agent")
logger = test_logger.get_test_logger(__name__)


@TRACER.start_as_current_span("in_desired_state")
Expand Down Expand Up @@ -34,6 +36,9 @@ def in_desired_state(
for event in intermediate_events:
if event in current_message:
found = True
logger.debug(
f"Found intermediate event in failure: {event} in {current_message}. Skipping the failure state"
)

if not found:
raise AssertionError(f'Got into Failed phase while waiting for Running! ("{current_message}")')
Expand Down
100 changes: 87 additions & 13 deletions docker/mongodb-kubernetes-tests/kubetester/mongotester.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pymongo
from kubetester import kubetester
from kubetester.kubetester import KubernetesTester
from kubetester.phase import Phase
from opentelemetry import trace
from pycognito import Cognito
from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult
Expand Down Expand Up @@ -76,6 +77,63 @@ def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult:
return OIDCCallbackResult(access_token=u.id_token)


def _wait_for_mongodbuser_reconciliation() -> None:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a kind of catch - all approach. Instead I could take the time to find all places where we update the secret/user and add this there. But I don't know where it is and might end up as a rabbit chase. I think that would be the correct approach - but I rather have this as a dedicated item instead.

"""
Wait for ALL MongoDBUser resources in the namespace to be reconciled before attempting authentication.
This prevents race conditions when passwords or user configurations have been recently changed.

Lists all MongoDBUser resources in the namespace and waits for ALL of them to reach Updated phase.
"""
try:
# Import inside function to avoid circular imports
import kubernetes.client as client
from kubetester.mongodb_user import MongoDBUser
from tests.conftest import get_central_cluster_client

namespace = KubernetesTester.get_namespace()
api_client = client.CustomObjectsApi(api_client=get_central_cluster_client())

try:
mongodb_users = api_client.list_namespaced_custom_object(
group="mongodb.com", version="v1", namespace=namespace, plural="mongodbusers"
)

all_users = []

for user_item in mongodb_users.get("items", []):
user_name = user_item.get("metadata", {}).get("name", "unknown")
username = user_item.get("spec", {}).get("username", "unknown")
all_users.append((user_name, username))

if not all_users:
return

logging.info(
f"Found {len(all_users)} MongoDBUser resource(s) in namespace '{namespace}', waiting for all to reach Updated phase..."
)

for user_name, username in all_users:
try:
logging.info(
f"Waiting for MongoDBUser '{user_name}' (username: {username}) to reach Updated phase..."
)

user = MongoDBUser(name=user_name, namespace=namespace)
user.assert_reaches_phase(Phase.Updated, timeout=300)
logging.info(f"MongoDBUser '{user_name}' reached Updated phase - reconciliation complete")

except Exception as e:
logging.warning(f"Failed to wait for MongoDBUser '{user_name}' reconciliation: {e}")
# Continue with other users - don't fail the entire test

logging.info("All MongoDBUser resources reconciliation check complete")

except Exception as e:
logging.warning(f"Failed to list MongoDBUser resources: {e} - proceeding without reconciliation wait")
except Exception as e:
logging.warning(f"Error while waiting for MongoDBUser reconciliation: {e} - proceeding with authentication")


class MongoTester:
"""MongoTester is a general abstraction to work with mongo database. It encapsulates the client created in
the constructor. All general methods non-specific to types of mongodb topologies should reside here."""
Expand Down Expand Up @@ -115,7 +173,7 @@ def _init_client(self, **kwargs):

def assert_connectivity(
self,
attempts: int = 20,
attempts: int = 50,
db: str = "admin",
col: str = "myCol",
opts: Optional[List[Dict[str, any]]] = None,
Expand Down Expand Up @@ -175,13 +233,17 @@ def assert_scram_sha_authentication(
username: str,
password: str,
auth_mechanism: str,
attempts: int = 20,
attempts: int = 50,
ssl: bool = False,
**kwargs,
) -> None:
assert attempts > 0
assert auth_mechanism in {"SCRAM-SHA-256", "SCRAM-SHA-1"}

# Wait for ALL MongoDBUser resources to be reconciled before attempting authentication
# This prevents race conditions when passwords have been recently changed
_wait_for_mongodbuser_reconciliation()

for i in reversed(range(attempts)):
try:
self._authenticate_with_scram(
Expand All @@ -194,14 +256,15 @@ def assert_scram_sha_authentication(
return
except OperationFailure as e:
if i == 0:
fail(msg=f"unable to authenticate after {attempts} attempts with error: {e}")
fail(f"unable to authenticate after {attempts} attempts with error: {e}")

time.sleep(5)

def assert_scram_sha_authentication_fails(
self,
username: str,
password: str,
retries: int = 20,
attempts: int = 50,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a lot of auth times we were really close to the 1m (20*5) mark. That is not a good timeout

ssl: bool = False,
**kwargs,
):
Expand All @@ -211,13 +274,16 @@ def assert_scram_sha_authentication_fails(
which still exists. When we change a password, we should eventually no longer be able to auth with
that user's credentials.
"""
for i in range(retries):

_wait_for_mongodbuser_reconciliation()

for i in range(attempts):
try:
self._authenticate_with_scram(username, password, ssl=ssl, **kwargs)
except OperationFailure:
return
time.sleep(5)
fail(f"was still able to authenticate with username={username} password={password} after {retries} attempts")
fail(f"was still able to authenticate with username={username} password={password} after {attempts} attempts")

def _authenticate_with_scram(
self,
Expand All @@ -239,9 +305,11 @@ def _authenticate_with_scram(
# authentication doesn't actually happen until we interact with a database
self.client["admin"]["myCol"].insert_one({})

def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, **kwargs):
def assert_x509_authentication(self, cert_file_name: str, attempts: int = 50, **kwargs):
assert attempts > 0

_wait_for_mongodbuser_reconciliation()

options = self._merge_options(
[
with_x509(cert_file_name, kwargs.get("tlsCAFile", kubetester.SSL_CA_CERT)),
Expand All @@ -257,7 +325,8 @@ def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, **
return
except OperationFailure:
if attempts == 0:
fail(msg=f"unable to authenticate after {total_attempts} attempts")
fail(f"unable to authenticate after {total_attempts} attempts")

time.sleep(5)

def assert_ldap_authentication(
Expand All @@ -268,8 +337,9 @@ def assert_ldap_authentication(
collection: str = "myCol",
tls_ca_file: Optional[str] = None,
ssl_certfile: str = None,
attempts: int = 20,
attempts: int = 50,
):
_wait_for_mongodbuser_reconciliation()

options = with_ldap(ssl_certfile, tls_ca_file)
total_attempts = attempts
Expand All @@ -289,17 +359,20 @@ def assert_ldap_authentication(
return
except OperationFailure:
if attempts <= 0:
fail(msg=f"unable to authenticate after {total_attempts} attempts")
fail(f"unable to authenticate after {total_attempts} attempts")

time.sleep(5)

def assert_oidc_authentication(
self,
db: str = "admin",
collection: str = "myCol",
attempts: int = 10,
attempts: int = 50,
):
assert attempts > 0

_wait_for_mongodbuser_reconciliation()

props = {"OIDC_CALLBACK": MyOIDCCallback()}

total_attempts = attempts
Expand All @@ -317,6 +390,7 @@ def assert_oidc_authentication(
except OperationFailure as e:
if attempts == 0:
raise RuntimeError(f"Unable to authenticate after {total_attempts} attempts: {e}")

time.sleep(5)

def assert_oidc_authentication_fails(self, db: str = "admin", collection: str = "myCol", attempts: int = 10):
Expand All @@ -326,7 +400,7 @@ def assert_oidc_authentication_fails(self, db: str = "admin", collection: str =
attempts -= 1
try:
if attempts <= 0:
fail(msg=f"was able to authenticate with OIDC after {total_attempts} attempts")
fail(f"was able to authenticate with OIDC after {total_attempts} attempts")

self.assert_oidc_authentication(db, collection, 1)
time.sleep(5)
Expand Down Expand Up @@ -362,7 +436,7 @@ def assert_deployment_reachable(self, attempts: int = 10):
if hosts_unreachable == 0:
return
if attempts <= 0:
fail(msg="Some hosts still report NO_DATA state")
fail("Some hosts still report NO_DATA state")
time.sleep(10)


Expand Down
5 changes: 3 additions & 2 deletions docker/mongodb-kubernetes-tests/kubetester/opsmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
import time
from base64 import b64decode
from typing import Any, Callable, Dict, List, Optional
from typing import Callable, Dict, List, Optional

import kubernetes.client
import requests
Expand All @@ -13,7 +13,6 @@
from kubetester import (
create_configmap,
create_or_update_secret,
read_configmap,
read_secret,
)
from kubetester.automation_config_tester import AutomationConfigTester
Expand Down Expand Up @@ -1029,6 +1028,8 @@ def assert_reaches_phase(
# This can be an intermediate error, right before we check for this secret we create it.
# The cluster might just be slow
"failed to locate the api key secret",
# etcd might be slow
"etcdserver: request timed out",
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the reason that the multi reconcile test keeps failing is due to a large amount of appdb concurrent creation of configmaps (the automation config).

https://spruce.mongodb.com/task/mongodb_kubernetes_e2e_operator_race_ubi_with_telemetry_e2e_om_reconcile_race_with_telemetry_patch_dd5f1d83bb18ee6c258effa7ef18f6b1841f1cc6_68b16aa2a26fc80007842f79_25_08_29_08_53_56/files?execution=1&sorts=STATUS%3AASC

("Error creating automation config map in cluster __default: etcdserver: request timed out")

)

start_time = time.time()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ class TestCanEnableScramSha256:
@TRACER.start_as_current_span("test_can_enable_scram_sha_256")
def test_can_enable_scram_sha_256(self, sharded_cluster: MongoDB, ca_path: str):
kubetester.wait_processes_ready()
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1400)

sharded_cluster.load()
sharded_cluster["spec"]["security"]["authentication"]["enabled"] = True
Expand All @@ -115,7 +115,7 @@ def test_can_enable_scram_sha_256(self, sharded_cluster: MongoDB, ca_path: str):
]
sharded_cluster["spec"]["security"]["authentication"]["agents"]["mode"] = "SCRAM"
sharded_cluster.update()
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1400)

def test_assert_connectivity(self, ca_path: str):
ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity(attempts=25)
Expand Down
20 changes: 20 additions & 0 deletions docker/mongodb-kubernetes-tests/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1684,6 +1684,26 @@ def pytest_sessionfinish(session, exitstatus):
ev = tester.get_project_events().json()["results"]
with open(f"/tmp/diagnostics/{project_id}-events.json", "w", encoding="utf-8") as f:
json.dump(ev, f, ensure_ascii=False, indent=4)

if exitstatus != 0:
try:
automation_config_tester = tester.get_automation_config_tester()
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a lot of times the ac would help a lot (especially in those auth change cases)

automation_config = automation_config_tester.automation_config
if not automation_config:
continue

# Remove mongoDbVersions field as it's too large
if "mongoDbVersions" in automation_config:
del automation_config["mongoDbVersions"]

with open(
f"/tmp/diagnostics/{project_id}-automation-config.json", "w", encoding="utf-8"
) as f:
json.dump(automation_config, f, ensure_ascii=False, indent=4)

logging.info(f"Saved automation config for project {project_id}")
except Exception as e:
logging.warning(f"Failed to collect automation config for project {project_id}: {e}")
else:
logging.info("om is not healthy - not collecting events information")

Expand Down