@@ -128,12 +128,11 @@ def deploy_nvmeof(self):
128128
129129 total_images = int (self .namespaces_count ) * int (self .subsystems_count )
130130 log .info (f'[nvmeof]: creating { total_images } images' )
131+ rbd_create_cmd = []
131132 for i in range (1 , total_images + 1 ):
132133 imagename = self .image_name_prefix + str (i )
133- log .info (f'[nvmeof]: rbd create { poolname } /{ imagename } --size { self .rbd_size } ' )
134- _shell (self .ctx , self .cluster_name , self .remote , [
135- 'rbd' , 'create' , f'{ poolname } /{ imagename } ' , '--size' , f'{ self .rbd_size } '
136- ])
134+ rbd_create_cmd += ['rbd' , 'create' , f'{ poolname } /{ imagename } ' , '--size' , f'{ self .rbd_size } ' , run .Raw (';' )]
135+ _shell (self .ctx , self .cluster_name , self .remote , rbd_create_cmd )
137136
138137 for role , i in daemons .items ():
139138 remote , id_ = i
@@ -251,9 +250,9 @@ class NvmeofThrasher(Thrasher, Greenlet):
251250
252251 daemon_max_thrash_times:
253252 For now, NVMeoF daemons have a limitation that each daemon can
254- be thrashed only 3 times in span of 30 mins. This option
253+ be thrashed only 5 times in span of 30 mins. This option
255254 allows to set the amount of times it could be thrashed in a period
256- of time. (default: 3 )
255+ of time. (default: 5 )
257256 daemon_max_thrash_period:
258257 This option goes with the above option. It sets the period of time
259258 over which each daemon can be thrashed for daemon_max_thrash_times
@@ -306,12 +305,12 @@ def __init__(self, ctx, config, daemons) -> None:
306305 self .max_thrash_daemons = int (self .config .get ('max_thrash' , len (self .daemons ) - 1 ))
307306
308307 # Limits on thrashing each daemon
309- self .daemon_max_thrash_times = int (self .config .get ('daemon_max_thrash_times' , 3 ))
308+ self .daemon_max_thrash_times = int (self .config .get ('daemon_max_thrash_times' , 5 ))
310309 self .daemon_max_thrash_period = int (self .config .get ('daemon_max_thrash_period' , 30 * 60 )) # seconds
311310
312311 self .min_thrash_delay = int (self .config .get ('min_thrash_delay' , 60 ))
313312 self .max_thrash_delay = int (self .config .get ('max_thrash_delay' , self .min_thrash_delay + 30 ))
314- self .min_revive_delay = int (self .config .get ('min_revive_delay' , 100 ))
313+ self .min_revive_delay = int (self .config .get ('min_revive_delay' , 60 ))
315314 self .max_revive_delay = int (self .config .get ('max_revive_delay' , self .min_revive_delay + 30 ))
316315
317316 def _get_devices (self , remote ):
@@ -347,6 +346,7 @@ def do_checks(self):
347346 run .Raw ('&&' ), 'ceph' , 'orch' , 'ps' , '--daemon-type' , 'nvmeof' ,
348347 run .Raw ('&&' ), 'ceph' , 'health' , 'detail' ,
349348 run .Raw ('&&' ), 'ceph' , '-s' ,
349+ run .Raw ('&&' ), 'sudo' , 'nvme' , 'list' ,
350350 ]
351351 for dev in self .devices :
352352 check_cmd += [
@@ -421,13 +421,11 @@ def do_thrash(self):
421421 while not self .stopping .is_set ():
422422 killed_daemons = defaultdict (list )
423423
424- weight = 1.0 / len ( self .daemons )
425- count = 0
424+ thrash_daemon_num = self . rng . randint ( 1 , self .max_thrash_daemons )
425+ selected_daemons = self . rng . sample ( self . daemons , thrash_daemon_num )
426426 for daemon in self .daemons :
427- skip = self .rng .uniform (0.0 , 1.0 )
428- if weight <= skip :
429- self .log ('skipping daemon {label} with skip ({skip}) > weight ({weight})' .format (
430- label = daemon .id_ , skip = skip , weight = weight ))
427+ if daemon not in selected_daemons :
428+ self .log (f'skipping daemon { daemon .id_ } ...' )
431429 continue
432430
433431 # For now, nvmeof daemons can only be thrashed daemon_max_thrash_times times (default: 5) within the last daemon_max_thrash_period seconds (default: 30 mins).
@@ -445,17 +443,11 @@ def do_thrash(self):
445443 continue
446444
447445 self .log ('kill {label}' .format (label = daemon .id_ ))
448- # daemon.stop()
449446 kill_method = self .kill_daemon (daemon )
450447
451448 killed_daemons [kill_method ].append (daemon )
452449 daemons_thrash_history [daemon .id_ ] += [datetime .now ()]
453450
454- # only thrash max_thrash_daemons amount of daemons
455- count += 1
456- if count >= self .max_thrash_daemons :
457- break
458-
459451 if killed_daemons :
460452 iteration_summary = "thrashed- "
461453 for kill_method in killed_daemons :
@@ -468,7 +460,7 @@ def do_thrash(self):
468460
469461 self .log (f'waiting for { revive_delay } secs before reviving' )
470462 time .sleep (revive_delay ) # blocking wait
471- self .log ('done waiting before reviving' )
463+ self .log (f 'done waiting before reviving - iteration # { len ( summary ) } : { iteration_summary } ' )
472464
473465 self .do_checks ()
474466 self .switch_task ()
@@ -487,7 +479,7 @@ def do_thrash(self):
487479 if thrash_delay > 0.0 :
488480 self .log (f'waiting for { thrash_delay } secs before thrashing' )
489481 time .sleep (thrash_delay ) # blocking
490- self .log ('done waiting before thrashing' )
482+ self .log ('done waiting before thrashing - everything should be up now ' )
491483
492484 self .do_checks ()
493485 self .switch_task ()
0 commit comments