99import json
1010import math
1111from teuthology import misc as teuthology
12+ from teuthology .contextutil import safe_while
1213from tasks import ceph_manager
1314from tasks .cephfs .filesystem import MDSCluster
1415from tasks .thrasher import Thrasher
@@ -224,6 +225,25 @@ def max_killable(self):
224225 else :
225226 return m
226227
def _wait_until_quorum(self, mon, size, timeout=300):
    """
    Wait until the given monitor reports a quorum of the expected size.

    Repeatedly fetches the monitor's status through
    ``self.manager.get_mon_status(mon)`` and stops as soon as the
    length of the status's ``'quorum'`` list equals ``size``.

    :param mon: monitor identifier handed to ``get_mon_status``
    :param size: expected number of entries in the ``'quorum'`` list
    :param timeout: approximate wait budget in seconds; it is converted
                    into a number of polls at a fixed interval
    :returns: the last status dict fetched for ``mon``
    :raises: nothing is caught here; whatever ``safe_while`` or
             ``get_mon_status`` raises propagates to the caller
             (NOTE(review): safe_while is expected to raise once the
             tries are used up -- confirm against teuthology.contextutil)
    """
    # Seconds between polls; also the divisor that turns the timeout
    # into a try count, so the two values cannot drift apart.
    poll_interval = 3
    # A timeout shorter than one interval would floor to zero tries.
    # Keep at least one attempt so the call never depends on how
    # safe_while treats a non-positive try count.
    tries = max(1, timeout // poll_interval)

    self.log('waiting for quorum size %d for mon %s' % (size, mon))
    s = {}

    with safe_while(sleep=poll_interval,
                    tries=tries,
                    action=f'wait for quorum size {size} on mon {mon}') as proceed:
        while proceed():
            s = self.manager.get_mon_status(mon)
            if len(s['quorum']) == size:
                break
            # Not there yet: record the size currently seen by this mon.
            self.log("quorum is size %d" % len(s['quorum']))

    self.log("final quorum is size %d" % len(s['quorum']))
    return s
246+
227247 def do_thrash (self ):
228248 """
229249 _do_thrash() wrapper.
@@ -261,7 +281,11 @@ def _do_thrash(self):
261281 self .manager .wait_for_mon_quorum_size (len (mons ))
262282 self .log ('making sure all monitors are in the quorum' )
263283 for m in mons :
264- s = self .manager .get_mon_status (m )
284+ try :
285+ s = self ._wait_until_quorum (m , len (mons ), timeout = 30 )
286+ except Exception as e :
287+ self .log ('mon.{m} is not in quorum size, exception: {e}' .format (m = m ,e = e ))
288+ self .log ('mon_status: {s}' .format (s = s ))
265289 assert s ['state' ] == 'leader' or s ['state' ] == 'peon'
266290 assert len (s ['quorum' ]) == len (mons )
267291
@@ -300,7 +324,12 @@ def _do_thrash(self):
300324 for m in mons :
301325 if m in mons_to_kill :
302326 continue
303- s = self .manager .get_mon_status (m )
327+ try :
328+ s = self ._wait_until_quorum (m , len (mons )- len (mons_to_kill ), timeout = 30 )
329+ except Exception as e :
330+ self .log ('mon.{m} is not in quorum size, exception: {e}' .format (m = m ,e = e ))
331+ self .log ('mon_status: {s}' .format (s = s ))
332+
304333 assert s ['state' ] == 'leader' or s ['state' ] == 'peon'
305334 assert len (s ['quorum' ]) == len (mons )- len (mons_to_kill )
306335
@@ -322,7 +351,12 @@ def _do_thrash(self):
322351
323352 self .manager .wait_for_mon_quorum_size (len (mons ))
324353 for m in mons :
325- s = self .manager .get_mon_status (m )
354+ try :
355+ s = self ._wait_until_quorum (m , len (mons ), timeout = 30 )
356+ except Exception as e :
357+ self .log ('mon.{m} is not in quorum size, exception: {e}' .format (m = m ,e = e ))
358+ self .log ('mon_status: {s}' .format (s = s ))
359+
326360 assert s ['state' ] == 'leader' or s ['state' ] == 'peon'
327361 assert len (s ['quorum' ]) == len (mons )
328362
0 commit comments