@@ -209,6 +209,22 @@ def set_gateway_cfg(self):
209209 if self .create_mtls_secrets :
210210 self .write_mtls_config (gateway_ips )
211211 log .info ("[nvmeof]: executed set_gateway_cfg successfully!" )
212+
def teardown(self):
    """
    Remove the nvmeof service of every gateway group.

    One ``ceph orch rm nvmeof.<poolname>.<group>`` is issued per group,
    where the group name is ``groups_prefix`` followed by the group index
    (``0 .. groups_count - 1``). ``ceph orch host ls`` is run before and
    after the removals; its result is not inspected here.
    """
    def orch(*subcommand):
        # All commands go through the module's _shell helper with the
        # same context, cluster and remote.
        _shell(self.ctx, self.cluster_name, self.remote,
               ['ceph', 'orch', *subcommand])

    log.info("[nvmeof] Removing nvmeof service")
    orch('host', 'ls')
    for index in range(self.groups_count):
        group = self.groups_prefix + str(index)
        orch('rm', f"nvmeof.{self.poolname}.{group}")
    orch('host', 'ls')
    log.info("[nvmeof] Nvmeof teardown completed!")
212228
213229
214230class NvmeofThrasher (Thrasher , Greenlet ):
@@ -334,26 +350,41 @@ def _run(self): # overriding
def stop(self):
    """
    Set the ``stopping`` event to request that thrashing stops.
    """
    stop_event = self.stopping
    stop_event.set()
336352
def stop_and_join(self):
    """
    Request a stop via ``stop()`` and then wait on ``join()``.

    Returns whatever ``join()`` returns.
    """
    self.stop()
    join_result = self.join()
    return join_result
359+
def do_checks(self):
    """
    Run some checks to see if everything is running well during thrashing.

    Each attempt does the following:
      * runs every daemon's ``status_cmd`` on its own remote
        (``check_status=False``, so a failing status is tolerated);
      * runs cluster-level ``ceph`` status commands on the remote of the
        last daemon in ``self.daemons``;
      * on ``self.checker_host``, runs ``nvme list`` and, for each entry
        of ``self.devices``, ``nvme list-subsys <dev> | grep 'live optimized'``.

    An attempt that raises ``run.CommandFailedError`` is logged and
    retried, up to 5 attempts in total. If every attempt fails this is
    logged as well; no exception is raised in that case (best-effort).
    """
    self.log('display and verify stats:')
    max_attempts = 5
    initiator_host = self.checker_host
    for attempt in range(1, max_attempts + 1):
        try:
            # Despite the name this is simply the remote of the last
            # daemon iterated, not a random choice.
            # NOTE(review): stays None when self.daemons is empty, which
            # makes the .run() calls below raise AttributeError.
            random_gateway_host = None
            for d in self.daemons:
                random_gateway_host = d.remote
                d.remote.sh(d.status_cmd, check_status=False)
            random_gateway_host.run(args=['ceph', 'orch', 'ls', '--refresh'])
            random_gateway_host.run(args=['ceph', 'orch', 'ps', '--daemon-type', 'nvmeof', '--refresh'])
            random_gateway_host.run(args=['ceph', 'health', 'detail'])
            random_gateway_host.run(args=['ceph', '-s'])
            # NOTE(review): pool and group names are hard-coded here;
            # confirm they match the deployed configuration.
            random_gateway_host.run(args=['ceph', 'nvme-gw', 'show', 'mypool', 'mygroup0'])

            initiator_host.run(args=['sudo', 'nvme', 'list'])
            for dev in self.devices:
                device_check_cmd = [
                    'sudo', 'nvme', 'list-subsys', dev,
                    run.Raw('|'), 'grep', 'live optimized'
                ]
                initiator_host.run(args=device_check_cmd)
            break
        except run.CommandFailedError as err:
            self.log(
                f"do_checks() attempt {attempt}/{max_attempts} failed: {err}"
            )
    else:
        # Reached only when no attempt hit `break`, i.e. all of them failed.
        self.log(f"do_checks() did not pass after {max_attempts} attempts")
357388
358389 def switch_task (self ):
359390 """
@@ -373,13 +404,14 @@ def switch_task(self):
373404 ):
374405 other_thrasher = t
375406 self .log ('switch_task: waiting for other thrasher' )
376- other_thrasher .switch_thrasher .wait (300 )
407+ other_thrasher .switch_thrasher .wait (600 )
377408 self .log ('switch_task: done waiting for the other thrasher' )
378409 other_thrasher .switch_thrasher .clear ()
379410
380411 def kill_daemon (self , daemon ):
381412 kill_methods = [
382- "ceph_daemon_stop" , "systemctl_stop" ,
413+ "ceph_daemon_stop" ,
414+ # "systemctl_stop",
383415 "daemon_remove" ,
384416 ]
385417 chosen_method = self .rng .choice (kill_methods )
@@ -390,7 +422,8 @@ def kill_daemon(self, daemon):
390422 d_name
391423 ], check_status = False )
392424 elif chosen_method == "systemctl_stop" :
393- daemon .stop ()
425+ # To bypass is_started logic of CephadmUnit
426+ daemon .remote .sh (daemon .stop_cmd , check_status = False )
394427 elif chosen_method == "daemon_remove" :
395428 daemon .remote .run (args = [
396429 "ceph" , "orch" , "daemon" , "rm" ,
@@ -399,14 +432,24 @@ def kill_daemon(self, daemon):
399432 return chosen_method
400433
def revive_daemon(self, daemon, killed_method):
    """
    Bring back a daemon that ``kill_daemon`` took down.

    :param daemon: object exposing ``type_``, ``id_`` and ``remote``
    :param killed_method: name of the kill method that was used

    A daemon killed with "ceph_daemon_stop" is revived with
    ``ceph orch daemon restart``; every other method is revived with
    ``ceph orch daemon start``.
    """
    daemon_name = f"{daemon.type_}.{daemon.id_}"
    # note: temporarily use 'daemon start' to restart
    # daemons instead of 'systemctl start'
    if killed_method == "ceph_daemon_stop":
        orch_action = "restart"
    else:
        orch_action = "start"
    daemon.remote.run(args=[
        "ceph", "orch", "daemon", orch_action,
        daemon_name
    ])
410453
411454 def do_thrash (self ):
412455 self .log ('start thrashing' )
@@ -515,6 +558,9 @@ def end(self):
515558 self .thrasher .join ()
516559 log .info ('done joining' )
517560
def teardown(self):
    """
    Log that the nvmeof thrasher is being torn down.

    No other work is done here.
    """
    message = 'tearing down nvmeof thrasher...'
    log.info(message)
563+
518564
# Module-level aliases for the two classes above.
# NOTE(review): presumably these are the names the test framework looks up
# to run the deployment task and the thrasher task -- confirm against caller.
task = Nvmeof
thrash = ThrashTest
0 commit comments