Skip to content

Commit fbd10ba

Browse files
committed
test: monitor thrasher wait until quorum
With a 1-second delay we may sometimes fail to get the correct quorum length because the monitor was not updated in time. With this fix, we wait for the quorum and re-check every few seconds (3) until the timeout (30) expires. Fixes: https://tracker.ceph.com/issues/52316 Signed-off-by: Nitzan Mordechai <[email protected]>
1 parent 9bbf3b1 commit fbd10ba

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

qa/tasks/mon_thrash.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import json
1010
import math
1111
from teuthology import misc as teuthology
12+
from teuthology.contextutil import safe_while
1213
from tasks import ceph_manager
1314
from tasks.cephfs.filesystem import MDSCluster
1415
from tasks.thrasher import Thrasher
@@ -224,6 +225,25 @@ def max_killable(self):
224225
else:
225226
return m
226227

228+
def _wait_until_quorum(self, mon, size, timeout=300):
    """
    Poll a monitor's status until the quorum it reports has the given size.

    :param mon: id of the monitor whose status is queried
    :param size: number of quorum members to wait for
    :param timeout: approximate upper bound on the wait, in seconds; it is
                    converted into a number of polls spaced 3 seconds apart
    :returns: the last status returned by self.manager.get_mon_status(mon)

    NOTE(review): presumably safe_while raises once the tries are used up
    (the callers wrap this call in try/except) -- confirm against
    teuthology.contextutil. Nothing is caught here, so any such exception
    propagates to the caller.
    """
    poll_interval = 3
    # Never hand safe_while a try count of 0: `timeout // poll_interval`
    # is 0 for any timeout shorter than one interval.
    # NOTE(review): how safe_while treats tries=0 is not visible here --
    # confirm; clamping to 1 guarantees at least one bounded attempt.
    tries = max(1, timeout // poll_interval)

    self.log('waiting for quorum size %d for mon %s' % (size, mon))
    s = {}

    with safe_while(sleep=poll_interval,
                    tries=tries,
                    action=f'wait for quorum size {size} on mon {mon}') as proceed:
        while proceed():
            s = self.manager.get_mon_status(mon)
            if len(s['quorum']) == size:
                break
            # Not there yet: record the size seen on this poll.
            self.log("quorum is size %d" % len(s['quorum']))

    self.log("final quorum is size %d" % len(s['quorum']))
    return s
246+
227247
def do_thrash(self):
228248
"""
229249
_do_thrash() wrapper.
@@ -261,7 +281,11 @@ def _do_thrash(self):
261281
self.manager.wait_for_mon_quorum_size(len(mons))
262282
self.log('making sure all monitors are in the quorum')
263283
for m in mons:
264-
s = self.manager.get_mon_status(m)
284+
try:
285+
s = self._wait_until_quorum(m, len(mons), timeout=30)
286+
except Exception as e:
287+
self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e))
288+
self.log('mon_status: {s}'.format(s=s))
265289
assert s['state'] == 'leader' or s['state'] == 'peon'
266290
assert len(s['quorum']) == len(mons)
267291

@@ -300,7 +324,12 @@ def _do_thrash(self):
300324
for m in mons:
301325
if m in mons_to_kill:
302326
continue
303-
s = self.manager.get_mon_status(m)
327+
try:
328+
s = self._wait_until_quorum(m, len(mons)-len(mons_to_kill), timeout=30)
329+
except Exception as e:
330+
self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e))
331+
self.log('mon_status: {s}'.format(s=s))
332+
304333
assert s['state'] == 'leader' or s['state'] == 'peon'
305334
assert len(s['quorum']) == len(mons)-len(mons_to_kill)
306335

@@ -322,7 +351,12 @@ def _do_thrash(self):
322351

323352
self.manager.wait_for_mon_quorum_size(len(mons))
324353
for m in mons:
325-
s = self.manager.get_mon_status(m)
354+
try:
355+
s = self._wait_until_quorum(m, len(mons), timeout=30)
356+
except Exception as e:
357+
self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e))
358+
self.log('mon_status: {s}'.format(s=s))
359+
326360
assert s['state'] == 'leader' or s['state'] == 'peon'
327361
assert len(s['quorum']) == len(mons)
328362

0 commit comments

Comments
 (0)