Skip to content

Commit 2dc5093

Browse files
matthewdale authored and Jibola committed
DRIVERS-2964 Check primary region again after waiting for 30 minutes on assertPrimaryRegion failure. (mongodb-labs#215)
1 parent be2d8cd commit 2dc5093

File tree

1 file changed

+36
-16
lines changed

1 file changed

+36
-16
lines changed

astrolabe/atlas_runner.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,8 @@ def run(self, persist_cluster=False, startup_time=1):
282282
f"Waiting up to {timeout}s for primary node to be in region '{region}'"
283283
)
284284

285-
ok = False
286285
with mongo_client(self.get_connection_string()) as mc:
286+
ok = False
287287
while timer.elapsed < timeout:
288288
rsc = mc.admin.command("replSetGetConfig")
289289
members = rsc["config"]["members"]
@@ -300,24 +300,44 @@ def run(self, persist_cluster=False, startup_time=1):
300300

301301
sleep(5)
302302

303-
# If the primary isn't in the target region by the timeout,
304-
# log the cluster state every 5 seconds for the next 30
305-
# minutes to help us understand confusing behavior with the
306-
# cluster state. After logging the cluster state for 30
307-
# minutes, raise an exception.
308-
#
309-
# See https://jira.mongodb.org/browse/PRODTRIAGE-1232 for
310-
# more context.
311-
if not ok:
312-
msg = f"Primary node ({mc.primary}) not in expected region '{region}' within {timeout}s (current region: '{member_region}'; all members: {members})"
313-
LOGGER.error(msg)
303+
# If the primary isn't in the target region by the
304+
# timeout, log the cluster state every 5 seconds for the
305+
# next 30 minutes to help us understand confusing
306+
# behavior with the cluster state. After logging the
307+
# cluster state for 30 minutes, check if the primary is
308+
# in the target region again. If it still isn't in the
309+
# target region, raise an exception.
310+
#
311+
# See https://jira.mongodb.org/browse/PRODTRIAGE-1232
312+
# and https://jira.mongodb.org/browse/DRIVERS-2964 for
313+
# more context.
314+
#
315+
# TODO: Figure out a more reliable way to check for
316+
# cluster updates, or figure out why cluster state is
317+
# unreliable, then remove this extra logging and extra
318+
# region check.
319+
if not ok:
320+
LOGGER.error(
321+
f"Primary node ({mc.primary}) not in expected region '{region}' within {timeout}s. (current region: '{member_region}'; all members: {members})"
322+
)
323+
LOGGER.info("Logging cluster state for 30m after assertPrimaryRegion failure, then checking primary region again.")
324+
self.log_cluster_status(timeout=1800)
314325

315-
LOGGER.info("Logging cluster state for 30m after assertPrimaryRegion failure.")
316-
self.log_cluster_status(timeout=1800)
326+
rsc = mc.admin.command("replSetGetConfig")
327+
members = rsc["config"]["members"]
328+
member = next(
329+
m
330+
for m in members
331+
if m["horizons"]["PUBLIC"] == "%s:%s" % mc.primary
332+
)
333+
member_region = member["tags"]["region"]
317334

318-
raise Exception(msg)
335+
if region != member_region:
336+
raise Exception(
337+
f"Primary node ({mc.primary}) still not in expected region '{region}' after waiting an extra 30m. (current region: '{member_region}'; all members: {members})"
338+
)
319339

320-
LOGGER.debug(
340+
LOGGER.info(
321341
f"Waited for {timer.elapsed}s for primary node to be in region '{region}'"
322342
)
323343

0 commit comments

Comments (0)