-
Notifications
You must be signed in to change notification settings - Fork 17
CLOUDP-316922 - Fix racy and slow auth tests like in openshift clusters #384
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3a66cf5
0713a2a
f062d2b
2b8bfc3
2ca951c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
import pymongo | ||
from kubetester import kubetester | ||
from kubetester.kubetester import KubernetesTester | ||
from kubetester.phase import Phase | ||
from opentelemetry import trace | ||
from pycognito import Cognito | ||
from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult | ||
|
@@ -76,6 +77,63 @@ def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: | |
return OIDCCallbackResult(access_token=u.id_token) | ||
|
||
|
||
def _wait_for_mongodbuser_reconciliation() -> None: | ||
""" | ||
Wait for ALL MongoDBUser resources in the namespace to be reconciled before attempting authentication. | ||
This prevents race conditions when passwords or user configurations have been recently changed. | ||
|
||
Lists all MongoDBUser resources in the namespace and waits for ALL of them to reach Updated phase. | ||
""" | ||
try: | ||
# Import inside function to avoid circular imports | ||
import kubernetes.client as client | ||
from kubetester.mongodb_user import MongoDBUser | ||
from tests.conftest import get_central_cluster_client | ||
|
||
namespace = KubernetesTester.get_namespace() | ||
api_client = client.CustomObjectsApi(api_client=get_central_cluster_client()) | ||
|
||
try: | ||
mongodb_users = api_client.list_namespaced_custom_object( | ||
group="mongodb.com", version="v1", namespace=namespace, plural="mongodbusers" | ||
) | ||
|
||
all_users = [] | ||
|
||
for user_item in mongodb_users.get("items", []): | ||
user_name = user_item.get("metadata", {}).get("name", "unknown") | ||
username = user_item.get("spec", {}).get("username", "unknown") | ||
all_users.append((user_name, username)) | ||
|
||
if not all_users: | ||
return | ||
|
||
logging.info( | ||
f"Found {len(all_users)} MongoDBUser resource(s) in namespace '{namespace}', waiting for all to reach Updated phase..." | ||
) | ||
|
||
for user_name, username in all_users: | ||
try: | ||
logging.info( | ||
f"Waiting for MongoDBUser '{user_name}' (username: {username}) to reach Updated phase..." | ||
) | ||
|
||
user = MongoDBUser(name=user_name, namespace=namespace) | ||
user.assert_reaches_phase(Phase.Updated, timeout=300) | ||
logging.info(f"MongoDBUser '{user_name}' reached Updated phase - reconciliation complete") | ||
|
||
except Exception as e: | ||
logging.warning(f"Failed to wait for MongoDBUser '{user_name}' reconciliation: {e}") | ||
# Continue with other users - don't fail the entire test | ||
|
||
logging.info("All MongoDBUser resources reconciliation check complete") | ||
|
||
except Exception as e: | ||
logging.warning(f"Failed to list MongoDBUser resources: {e} - proceeding without reconciliation wait") | ||
except Exception as e: | ||
logging.warning(f"Error while waiting for MongoDBUser reconciliation: {e} - proceeding with authentication") | ||
|
||
|
||
class MongoTester: | ||
"""MongoTester is a general abstraction to work with mongo database. It encapsulates the client created in | ||
the constructor. All general methods non-specific to types of mongodb topologies should reside here.""" | ||
|
@@ -115,7 +173,7 @@ def _init_client(self, **kwargs): | |
|
||
def assert_connectivity( | ||
self, | ||
attempts: int = 20, | ||
attempts: int = 50, | ||
db: str = "admin", | ||
col: str = "myCol", | ||
opts: Optional[List[Dict[str, any]]] = None, | ||
|
@@ -175,13 +233,17 @@ def assert_scram_sha_authentication( | |
username: str, | ||
password: str, | ||
auth_mechanism: str, | ||
attempts: int = 20, | ||
attempts: int = 50, | ||
ssl: bool = False, | ||
**kwargs, | ||
) -> None: | ||
assert attempts > 0 | ||
assert auth_mechanism in {"SCRAM-SHA-256", "SCRAM-SHA-1"} | ||
|
||
# Wait for ALL MongoDBUser resources to be reconciled before attempting authentication | ||
# This prevents race conditions when passwords have been recently changed | ||
_wait_for_mongodbuser_reconciliation() | ||
|
||
for i in reversed(range(attempts)): | ||
try: | ||
self._authenticate_with_scram( | ||
|
@@ -194,14 +256,15 @@ def assert_scram_sha_authentication( | |
return | ||
except OperationFailure as e: | ||
if i == 0: | ||
fail(msg=f"unable to authenticate after {attempts} attempts with error: {e}") | ||
fail(f"unable to authenticate after {attempts} attempts with error: {e}") | ||
|
||
time.sleep(5) | ||
|
||
def assert_scram_sha_authentication_fails( | ||
self, | ||
username: str, | ||
password: str, | ||
retries: int = 20, | ||
attempts: int = 50, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a lot of auth times we were really close to the 1m (20*5) mark. That is not a good timeout |
||
ssl: bool = False, | ||
**kwargs, | ||
): | ||
|
@@ -211,13 +274,16 @@ def assert_scram_sha_authentication_fails( | |
which still exists. When we change a password, we should eventually no longer be able to auth with | ||
that user's credentials. | ||
""" | ||
for i in range(retries): | ||
|
||
_wait_for_mongodbuser_reconciliation() | ||
|
||
for i in range(attempts): | ||
try: | ||
self._authenticate_with_scram(username, password, ssl=ssl, **kwargs) | ||
except OperationFailure: | ||
return | ||
time.sleep(5) | ||
fail(f"was still able to authenticate with username={username} password={password} after {retries} attempts") | ||
fail(f"was still able to authenticate with username={username} password={password} after {attempts} attempts") | ||
|
||
def _authenticate_with_scram( | ||
self, | ||
|
@@ -239,9 +305,11 @@ def _authenticate_with_scram( | |
# authentication doesn't actually happen until we interact with a database | ||
self.client["admin"]["myCol"].insert_one({}) | ||
|
||
def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, **kwargs): | ||
def assert_x509_authentication(self, cert_file_name: str, attempts: int = 50, **kwargs): | ||
assert attempts > 0 | ||
|
||
_wait_for_mongodbuser_reconciliation() | ||
|
||
options = self._merge_options( | ||
[ | ||
with_x509(cert_file_name, kwargs.get("tlsCAFile", kubetester.SSL_CA_CERT)), | ||
|
@@ -257,7 +325,8 @@ def assert_x509_authentication(self, cert_file_name: str, attempts: int = 20, ** | |
return | ||
except OperationFailure: | ||
if attempts == 0: | ||
fail(msg=f"unable to authenticate after {total_attempts} attempts") | ||
fail(f"unable to authenticate after {total_attempts} attempts") | ||
|
||
time.sleep(5) | ||
|
||
def assert_ldap_authentication( | ||
|
@@ -268,8 +337,9 @@ def assert_ldap_authentication( | |
collection: str = "myCol", | ||
tls_ca_file: Optional[str] = None, | ||
ssl_certfile: str = None, | ||
attempts: int = 20, | ||
attempts: int = 50, | ||
): | ||
_wait_for_mongodbuser_reconciliation() | ||
|
||
options = with_ldap(ssl_certfile, tls_ca_file) | ||
total_attempts = attempts | ||
|
@@ -289,17 +359,20 @@ def assert_ldap_authentication( | |
return | ||
except OperationFailure: | ||
if attempts <= 0: | ||
fail(msg=f"unable to authenticate after {total_attempts} attempts") | ||
fail(f"unable to authenticate after {total_attempts} attempts") | ||
|
||
time.sleep(5) | ||
|
||
def assert_oidc_authentication( | ||
self, | ||
db: str = "admin", | ||
collection: str = "myCol", | ||
attempts: int = 10, | ||
attempts: int = 50, | ||
): | ||
assert attempts > 0 | ||
|
||
_wait_for_mongodbuser_reconciliation() | ||
|
||
props = {"OIDC_CALLBACK": MyOIDCCallback()} | ||
|
||
total_attempts = attempts | ||
|
@@ -317,6 +390,7 @@ def assert_oidc_authentication( | |
except OperationFailure as e: | ||
if attempts == 0: | ||
raise RuntimeError(f"Unable to authenticate after {total_attempts} attempts: {e}") | ||
|
||
time.sleep(5) | ||
|
||
def assert_oidc_authentication_fails(self, db: str = "admin", collection: str = "myCol", attempts: int = 10): | ||
|
@@ -326,7 +400,7 @@ def assert_oidc_authentication_fails(self, db: str = "admin", collection: str = | |
attempts -= 1 | ||
try: | ||
if attempts <= 0: | ||
fail(msg=f"was able to authenticate with OIDC after {total_attempts} attempts") | ||
fail(f"was able to authenticate with OIDC after {total_attempts} attempts") | ||
|
||
self.assert_oidc_authentication(db, collection, 1) | ||
time.sleep(5) | ||
|
@@ -362,7 +436,7 @@ def assert_deployment_reachable(self, attempts: int = 10): | |
if hosts_unreachable == 0: | ||
return | ||
if attempts <= 0: | ||
fail(msg="Some hosts still report NO_DATA state") | ||
fail("Some hosts still report NO_DATA state") | ||
time.sleep(10) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
import re | ||
import time | ||
from base64 import b64decode | ||
from typing import Any, Callable, Dict, List, Optional | ||
from typing import Callable, Dict, List, Optional | ||
|
||
import kubernetes.client | ||
import requests | ||
|
@@ -13,7 +13,6 @@ | |
from kubetester import ( | ||
create_configmap, | ||
create_or_update_secret, | ||
read_configmap, | ||
read_secret, | ||
) | ||
from kubetester.automation_config_tester import AutomationConfigTester | ||
|
@@ -1029,6 +1028,8 @@ def assert_reaches_phase( | |
# This can be an intermediate error, right before we check for this secret we create it. | ||
# The cluster might just be slow | ||
"failed to locate the api key secret", | ||
# etcd might be slow | ||
"etcdserver: request timed out", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the reason that the multi reconcile test keeps failing is due to a large amount of appdb concurrent creation of configmaps (the automation config).
|
||
) | ||
|
||
start_time = time.time() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1684,6 +1684,26 @@ def pytest_sessionfinish(session, exitstatus): | |
ev = tester.get_project_events().json()["results"] | ||
with open(f"/tmp/diagnostics/{project_id}-events.json", "w", encoding="utf-8") as f: | ||
json.dump(ev, f, ensure_ascii=False, indent=4) | ||
|
||
if exitstatus != 0: | ||
try: | ||
automation_config_tester = tester.get_automation_config_tester() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a lot of times the ac would help a lot (especially in those auth change cases) |
||
automation_config = automation_config_tester.automation_config | ||
if not automation_config: | ||
continue | ||
|
||
# Remove mongoDbVersions field as it's too large | ||
if "mongoDbVersions" in automation_config: | ||
del automation_config["mongoDbVersions"] | ||
lucian-tosa marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
with open( | ||
f"/tmp/diagnostics/{project_id}-automation-config.json", "w", encoding="utf-8" | ||
) as f: | ||
json.dump(automation_config, f, ensure_ascii=False, indent=4) | ||
|
||
logging.info(f"Saved automation config for project {project_id}") | ||
except Exception as e: | ||
logging.warning(f"Failed to collect automation config for project {project_id}: {e}") | ||
else: | ||
logging.info("om is not healthy - not collecting events information") | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is a kind of catch - all approach. Instead I could take the time to find all places where we update the secret/user and add this there. But I don't know where it is and might end up as a rabbit chase. I think that would be the correct approach - but I rather have this as a dedicated item instead.