
Commit cf59a35

lucian-tosa and nammn authored
[flakiness] improve flakiness and perf of multi_cluster restore test (#37)
# Summary

This improves our third most flaky test (not 100% confident) - [HC](REDACTED).

Some changes:

- Fail early if the primary is not electable (no primary -> no backup).
- Use the proper PIT restore time instead of randomly choosing a date. The old approach sometimes led to errors like the one below, which won't happen anymore.
- It took around 40s; now it's done immediately.

```
[2025/03/03 13:24:01.103] Restoring back to the moment 15 seconds ago (millis): 1741004626102.5642
[2025/03/03 13:24:01.233] Encountered exception: Error sending request to Ops Manager API. 409 ({"detail":"Restore process initiation failed: The requested restore could not be started: Invalid restore point: Are you sure your backups were running at the time you selected?.","error":409,"errorCode":"RESTORE_INITIATION_FAILED","parameters":["The requested restore could not be started: Invalid restore point: Are you sure your backups were running at the time you selected?"],"reason":"Conflict"}).
[2025/03/03 13:24:01.233] Request details: post https://om-backup.a-1741002799-ftz7steaq7z.interconnected:8443/api/public/v1.0/groups/67c59c69bb720856d1c0c3e1/clusters/67c59cb0bb720856d1c0cdce/restoreJobs (data: {'delivery': {'methodName': 'AUTOMATED_RESTORE', 'targetGroupId': '67c59c69bb720856d1c0c3e1', 'targetClusterId': '67c59cb0bb720856d1c0cdce'}, 'pointInTimeUTCMillis': 1741004626102.5642}) on retry number 0
```

## Proof of Work

- https://spruce.mongodb.com/task/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_backup_restore_patch_b17acb360b9b2d9a598fe564ee2ced8f99dd7634_67c582286041a800070f3a6a_25_03_03_10_19_28/logs?execution=1&page=0&sorts=STATUS%3AASC%3BDURATION%3ADESC
- https://spruce.mongodb.com/task/ops_manager_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_backup_restore_patch_b17acb360b9b2d9a598fe564ee2ced8f99dd7634_67c85db031d4ae0007515f56_25_03_05_14_20_34/logs?execution=0&sortBy=STATUS&sortDir=ASC

## Checklist

- [ ] Have you linked a Jira ticket and/or is the ticket in the title?
- [x] Have you checked whether your Jira ticket required DOCSP changes?
- [x] Have you checked for release_note changes?

## Reminder (Please remove this when merging)

- Please try to Approve or Reject Changes on the PR; keep PRs in review for as short as possible.
- Our Short Guide for PRs: [Link](https://docs.google.com/document/d/1T93KUtdvONq43vfTfUt8l92uo4e4SEEvFbIEKOxGr44/edit?tab=t.0)
- Remember the following Communication Standards - use comment prefixes for clarity:
  * **blocking**: Must be addressed before approval.
  * **follow-up**: Can be addressed in a later PR or ticket.
  * **q**: Clarifying question.
  * **nit**: Non-blocking suggestions.
  * **note**: Side-note, non-actionable. Example: praise.
  * No prefix is considered a question.

---------

Co-authored-by: Nam Nguyen <[email protected]>
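For reviewers skimming the diffs below, here is the core of the fix condensed into a standalone sketch. The `pick_pit_restore_point` wrapper is illustrative only (it is not part of the diff); `om_tester` stands in for the `project_one`/`OMTester` fixture used by the test.

```python
# Illustrative sketch only: `om_tester` stands in for the OMTester/project_one
# fixture; the wrapper function itself does not exist in the diff.
def pick_pit_restore_point(om_tester) -> int:
    # wait_until_backup_snapshots_are_ready() now records the completion time
    # of the latest snapshot, exposed via the new getter (0 if none completed).
    backup_completion_millis = om_tester.get_latest_backup_completion_time()
    # Restore to a point shortly *after* the snapshot completed, instead of an
    # arbitrary "15 seconds ago" that may predate any usable backup and trigger
    # the 409 RESTORE_INITIATION_FAILED error shown in the log above.
    return backup_completion_millis + 1500
```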
1 parent 231e358 commit cf59a35

File tree

3 files changed (+65 -11 lines)


docker/mongodb-enterprise-tests/kubetester/omtester.py

Lines changed: 21 additions & 0 deletions

```diff
@@ -106,6 +106,7 @@ def __init__(self, om_context: OMContext):
         os.environ["OM_USER"] = self.context.user
         if self.context.base_url:
             os.environ["OM_HOST"] = self.context.base_url
+        self.latest_backup_completion_time = None

     def ensure_group_id(self):
         if self.context.project_id is None:
@@ -124,6 +125,12 @@ def create_restore_job_snapshot(self, snapshot_id: Optional[str] = None) -> str:

         return self.api_create_restore_job_from_snapshot(cluster_id, snapshot_id)["id"]

+    def set_latest_backup_completion_time(self, timestamp):
+        self.latest_backup_completion_time = timestamp
+
+    def get_latest_backup_completion_time(self):
+        return self.latest_backup_completion_time or 0
+
     def create_restore_job_pit(self, pit_milliseconds: int, retry: int = 120):
         """creates a restore job to restore the mongodb cluster to some version specified by the parameter."""
         cluster_id = self.get_backup_cluster_id()
@@ -165,6 +172,13 @@ def wait_until_backup_snapshots_are_ready(
                 print(f"Snapshots are ready, project: {self.context.group_name}, time: {time.time() - start_time} sec")
                 span = trace.get_current_span()
                 span.set_attribute(key="mck.snapshot_time", value=time.time() - start_time)
+                completed_snapshots = [s for s in snapshots if s.get("complete", False)]
+                latest_snapshot = max(completed_snapshots, key=lambda s: s["created"]["date"])
+                snapshot_timestamp = latest_snapshot["created"]["date"]
+                print(f"Current Backup Snapshots: {snapshots}")
+                self.set_latest_backup_completion_time(
+                    time_to_millis(datetime.fromisoformat(snapshot_timestamp.replace("Z", "")))
+                )
                 return
             time.sleep(3)
             timeout -= 3
@@ -804,3 +818,10 @@ def should_include_tag(version: Optional[Dict[str, str]]) -> bool:
         return semver.compare(version_string, feature_controls_enabled_version) < 0

     return True
+
+
+def time_to_millis(date_time) -> int:
+    """https://stackoverflow.com/a/11111177/614239"""
+    epoch = datetime.utcfromtimestamp(0)
+    pit_millis = (date_time - epoch).total_seconds() * 1000
+    return pit_millis
```
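As a quick sanity check, here is a self-contained example of converting an Ops Manager snapshot timestamp to epoch milliseconds the way `wait_until_backup_snapshots_are_ready` now does. The `time_to_millis` helper is copied from the diff above; the sample timestamp is made up.

```python
from datetime import datetime


def time_to_millis(date_time) -> int:
    """https://stackoverflow.com/a/11111177/614239"""
    epoch = datetime.utcfromtimestamp(0)
    pit_millis = (date_time - epoch).total_seconds() * 1000
    return pit_millis


# Snapshot creation dates come back as ISO-8601 strings with a trailing "Z";
# stripping the "Z" keeps fromisoformat() happy on older Python versions.
snapshot_timestamp = "2025-03-03T13:20:00Z"  # made-up example value
millis = time_to_millis(datetime.fromisoformat(snapshot_timestamp.replace("Z", "")))
print(millis)  # 1741008000000.0
```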

docker/mongodb-enterprise-tests/tests/conftest.py

Lines changed: 19 additions & 0 deletions

```diff
@@ -1662,6 +1662,25 @@ def install_multi_cluster_operator_cluster_scoped(
     )


+def wait_for_primary(mdb_client, timeout=500):
+    """
+    Waits until a primary node is elected before proceeding with database operations.
+    This prevents unnecessary timeouts during restore actions (they will just fail otherwise).
+    """
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            status = mdb_client.admin.command("replSetGetStatus")
+            primary = [m for m in status["members"] if m["stateStr"] == "PRIMARY"]
+            if primary:
+                print(f"✅ Primary node detected: {primary[0]['name']}")
+                return True
+        except Exception as e:
+            print(f"⏳ Waiting for primary election... Error: {e}")
+        time.sleep(5)
+    raise Exception("❌ No primary found within timeout")
+
+
 def assert_data_got_restored(test_data, collection1, collection2=None, timeout=300):
     """The data in the db has been restored to the initial state. Note, that this happens eventually - so
     we need to loop for some time (usually takes 60 seconds max). This is different from restoring from a
```
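A minimal usage sketch of the new `wait_for_primary` helper, assuming a reachable replica set. The connection string is a placeholder; in the real tests the client comes from the `mdb_client` fixture shown in the next file.

```python
import pymongo

from tests.conftest import wait_for_primary  # helper added in this commit

# Placeholder URI; the e2e tests build the client from the MongoDBMulti tester.
client = pymongo.MongoClient(
    "mongodb://example-host:27017/?replicaSet=rs0",
    readPreference="primary",
    serverSelectionTimeoutMS=5000,
)

# Blocks until replSetGetStatus reports a PRIMARY member, or raises after the
# timeout; restore steps only proceed once a primary exists.
wait_for_primary(client, timeout=120)
```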

docker/mongodb-enterprise-tests/tests/multicluster/multi_cluster_backup_restore.py

Lines changed: 25 additions & 11 deletions

```diff
@@ -23,9 +23,12 @@
 from kubetester.omtester import OMTester
 from kubetester.operator import Operator
 from kubetester.opsmanager import MongoDBOpsManager
-from pymongo.errors import ServerSelectionTimeoutError
 from pytest import fixture, mark
-from tests.conftest import assert_data_got_restored, update_coredns_hosts
+from tests.conftest import (
+    assert_data_got_restored,
+    update_coredns_hosts,
+    wait_for_primary,
+)

 TEST_DATA = {"_id": "unique_id", "name": "John", "address": "Highway 37", "age": 30}

@@ -362,13 +365,20 @@ def project_one(
         )

     @fixture(scope="function")
-    def mongodb_multi_one_collection(self, mongodb_multi_one: MongoDBMulti):
-        # we instantiate the pymongo client per test to avoid flakiness as the primary and secondary might swap
-        collection = pymongo.MongoClient(
+    def mdb_client(self, mongodb_multi_one: MongoDBMulti):
+        return pymongo.MongoClient(
             mongodb_multi_one.tester(port=MONGODB_PORT).cnx_string,
             **mongodb_multi_one.tester(port=MONGODB_PORT).default_opts,
-        )["testdb"]
-        return collection["testcollection"]
+            readPreference="primary",  # let's read from the primary and not stale data from the secondary
+        )
+
+    @fixture(scope="function")
+    def mongodb_multi_one_collection(self, mdb_client):
+
+        # Ensure primary is available before proceeding
+        wait_for_primary(mdb_client)
+
+        return mdb_client["testdb"]["testcollection"]

     @fixture(scope="module")
     def mongodb_multi_one(
@@ -472,6 +482,7 @@ def test_add_test_data(self, mongodb_multi_one_collection):
                 print(e)
                 max_attempts -= 1
                 time.sleep(6)
+        raise Exception("❌ Failed to insert test data after multiple attempts")

     @mark.e2e_multi_cluster_backup_restore
     def test_mdb_backed_up(self, project_one: OMTester):
@@ -489,14 +500,17 @@ def test_pit_restore(self, project_one: OMTester):
         now_millis = time_to_millis(datetime.datetime.now())
         print("\nCurrent time (millis): {}".format(now_millis))

-        pit_datetme = datetime.datetime.now() - datetime.timedelta(seconds=15)
-        pit_millis = time_to_millis(pit_datetme)
-        print("Restoring back to the moment 15 seconds ago (millis): {}".format(pit_millis))
+        backup_completion_time = project_one.get_latest_backup_completion_time()
+        print("\nbackup_completion_time: {}".format(backup_completion_time))
+
+        pit_millis = backup_completion_time + 1500
+
+        print(f"Restoring back to: {pit_millis}")

         project_one.create_restore_job_pit(pit_millis)

     @mark.e2e_multi_cluster_backup_restore
-    def test_data_got_restored(self, mongodb_multi_one_collection):
+    def test_data_got_restored(self, mongodb_multi_one_collection, mdb_client):
         assert_data_got_restored(TEST_DATA, mongodb_multi_one_collection, timeout=1200)
```