@@ -281,8 +281,10 @@ def run(self, persist_cluster=False, startup_time=1):
281
281
LOGGER .debug (
282
282
f"Waiting up to { timeout } s for primary node to be in region '{ region } '"
283
283
)
284
+
285
+ ok = False
284
286
with mongo_client (self .get_connection_string ()) as mc :
285
- while True :
287
+ while timer . elapsed < timeout :
286
288
rsc = mc .admin .command ("replSetGetConfig" )
287
289
members = rsc ["config" ]["members" ]
288
290
member = next (
@@ -293,14 +295,28 @@ def run(self, persist_cluster=False, startup_time=1):
293
295
member_region = member ["tags" ]["region" ]
294
296
295
297
if region == member_region :
298
+ ok = True
296
299
break
297
300
298
- if timer .elapsed > timeout :
299
- raise Exception (
300
- f"Primary node ({ mc .primary } ) not in expected region '{ region } ' within { timeout } s (current region: '{ member_region } '; all members: { members } )"
301
- )
302
301
sleep (5 )
303
302
303
+ # If the primary isn't in the target region by the timeout,
304
+ # log the cluster state every 5 seconds for the next 30
305
+ # minutes to help us understand confusing behavior with the
306
+ # cluster state. After logging the cluster state for 30
307
+ # minutes, raise an exception.
308
+ #
309
+ # See https://jira.mongodb.org/browse/PRODTRIAGE-1232 for
310
+ # more context.
311
+ if not ok :
312
+ msg = f"Primary node ({ mc .primary } ) not in expected region '{ region } ' within { timeout } s (current region: '{ member_region } '; all members: { members } )"
313
+ LOGGER .error (msg )
314
+
315
+ LOGGER .info ("Logging cluster state for 30m after assertPrimaryRegion failure." )
316
+ self .log_cluster_status (timeout = 1800 )
317
+
318
+ raise Exception (msg )
319
+
304
320
LOGGER .debug (
305
321
f"Waited for { timer .elapsed } s for primary node to be in region '{ region } '"
306
322
)
@@ -361,6 +377,28 @@ def run(self, persist_cluster=False, startup_time=1):
361
377
finally :
362
378
self .workload_runner .terminate ()
363
379
380
+ def log_cluster_status (self , timeout = 1800 ):
381
+ timer = Timer ()
382
+ timer .start ()
383
+
384
+ LOGGER .info ("Cluster %s: logging cluster state for %.1fs" , self .cluster_name , timeout )
385
+
386
+ while timer .elapsed < timeout :
387
+ sleep (5 )
388
+
389
+ try :
390
+ cluster_info = self .cluster_url .get ().data
391
+ except AtlasClientError as e :
392
+ LOGGER .error ("Cluster %s: Error getting cluster status: %e" , self .cluster_name , e )
393
+ continue
394
+
395
+ LOGGER .info (
396
+ "Cluster %s: state: %s; logged for %.1fs" ,
397
+ self .cluster_name ,
398
+ cluster_info .stateName ,
399
+ timer .elapsed ,
400
+ )
401
+
364
402
def wait_for_state (self , target_state ):
365
403
LOGGER .info (
366
404
"Cluster %s: Waiting for cluster to be target state %s" ,
0 commit comments