Skip to content

Commit be2d8cd

Browse files
matthewdaleJibola
authored andcommitted
DRIVERS-2964 Log cluster state for 30 minutes after assertPrimaryRegion failure. (mongodb-labs#214)
1 parent 75ef77b commit be2d8cd

File tree

1 file changed

+43
-5
lines changed

1 file changed

+43
-5
lines changed

astrolabe/atlas_runner.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,10 @@ def run(self, persist_cluster=False, startup_time=1):
281281
LOGGER.debug(
282282
f"Waiting up to {timeout}s for primary node to be in region '{region}'"
283283
)
284+
285+
ok = False
284286
with mongo_client(self.get_connection_string()) as mc:
285-
while True:
287+
while timer.elapsed < timeout:
286288
rsc = mc.admin.command("replSetGetConfig")
287289
members = rsc["config"]["members"]
288290
member = next(
@@ -293,14 +295,28 @@ def run(self, persist_cluster=False, startup_time=1):
293295
member_region = member["tags"]["region"]
294296

295297
if region == member_region:
298+
ok = True
296299
break
297300

298-
if timer.elapsed > timeout:
299-
raise Exception(
300-
f"Primary node ({mc.primary}) not in expected region '{region}' within {timeout}s (current region: '{member_region}'; all members: {members})"
301-
)
302301
sleep(5)
303302

303+
# If the primary isn't in the target region by the timeout,
304+
# log the cluster state every 5 seconds for the next 30
305+
# minutes to help us understand confusing behavior with the
306+
# cluster state. After logging the cluster state for 30
307+
# minutes, raise an exception.
308+
#
309+
# See https://jira.mongodb.org/browse/PRODTRIAGE-1232 for
310+
# more context.
311+
if not ok:
312+
msg = f"Primary node ({mc.primary}) not in expected region '{region}' within {timeout}s (current region: '{member_region}'; all members: {members})"
313+
LOGGER.error(msg)
314+
315+
LOGGER.info("Logging cluster state for 30m after assertPrimaryRegion failure.")
316+
self.log_cluster_status(timeout=1800)
317+
318+
raise Exception(msg)
319+
304320
LOGGER.debug(
305321
f"Waited for {timer.elapsed}s for primary node to be in region '{region}'"
306322
)
@@ -361,6 +377,28 @@ def run(self, persist_cluster=False, startup_time=1):
361377
finally:
362378
self.workload_runner.terminate()
363379

380+
def log_cluster_status(self, timeout=1800):
381+
timer = Timer()
382+
timer.start()
383+
384+
LOGGER.info("Cluster %s: logging cluster state for %.1fs", self.cluster_name, timeout)
385+
386+
while timer.elapsed < timeout:
387+
sleep(5)
388+
389+
try:
390+
cluster_info = self.cluster_url.get().data
391+
except AtlasClientError as e:
392+
LOGGER.error("Cluster %s: Error getting cluster status: %e", self.cluster_name, e)
393+
continue
394+
395+
LOGGER.info(
396+
"Cluster %s: state: %s; logged for %.1fs",
397+
self.cluster_name,
398+
cluster_info.stateName,
399+
timer.elapsed,
400+
)
401+
364402
def wait_for_state(self, target_state):
365403
LOGGER.info(
366404
"Cluster %s: Waiting for cluster to be target state %s",

0 commit comments

Comments
 (0)