
Commit 6546da1

Merge pull request ceph#61357 from VallariAg/wip-nvmeof-teuthology-test-fix-ha
qa: fix nvmeof teuthology thrasher fix
2 parents: 21f73a4 + 3b9b290

6 files changed, +95 -29 lines

qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml

Lines changed: 1 addition & 1 deletion
@@ -36,6 +36,6 @@ tasks:
         - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c,nvmeof.d
         - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c
     env:
-      SCALING_DELAYS: '50'
+      SCALING_DELAYS: '120'
       RBD_POOL: mypool
       NVMEOF_GROUP: mygroup0
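
The only functional change here is bumping SCALING_DELAYS from 50 to 120 seconds, presumably to give the orchestrator more time to redeploy gateways during the scale-down/scale-up checks. As a rough, hypothetical sketch (not the actual scalability_test.sh logic), this is how such a delay typically gates a polling loop; it assumes a made-up wait_for_gateways() helper and that 'ceph orch ps --format json' reports a status_desc field:

    # Hypothetical sketch only: how a scaling check might use SCALING_DELAYS
    # between polls while gateways are being redeployed.
    import json
    import os
    import subprocess
    import time

    def wait_for_gateways(expected):
        # SCALING_DELAYS comes from the workunit env above; default mirrors the new value.
        delay = int(os.environ.get("SCALING_DELAYS", "120"))
        for _ in range(10):
            out = subprocess.run(
                ["ceph", "orch", "ps", "--daemon-type", "nvmeof", "--format", "json"],
                capture_output=True, text=True, check=True,
            ).stdout
            running = [d for d in json.loads(out) if d.get("status_desc") == "running"]
            if len(running) == expected:
                return True
            time.sleep(delay)
        return False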

qa/suites/nvmeof/thrash/base/install.yaml

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@ tasks:
     extra_packages:
     - nvme-cli
 - cephadm:
-    watchdog_setup:
 - cephadm.shell:
     host.a:
       # get state before nvmeof deployment
Lines changed: 24 additions & 0 deletions (new file)
@@ -0,0 +1,24 @@
+tasks:
+- nvmeof:
+    installer: host.a
+    gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+    rbd:
+      pool_name: mypool
+      image_name_prefix: myimage
+    gateway_config:
+      subsystems_count: 2
+      namespaces_count: 8 # each subsystem
+      cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+    service: nvmeof.mypool.mygroup0
+
+- workunit:
+    no_coverage_and_limits: true
+    clients:
+      client.0:
+        - nvmeof/setup_subsystem.sh
+        - nvmeof/basic_tests.sh
+    env:
+      RBD_POOL: mypool
+      RBD_IMAGE_PREFIX: myimage
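
This new suite fragment deploys the nvmeof task on host.a with 2 subsystems of 8 namespaces each (16 namespaces total) backed by pool mypool, waits for the nvmeof.mypool.mygroup0 service to come up, and then runs nvmeof/setup_subsystem.sh and nvmeof/basic_tests.sh from client.0.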

qa/tasks/nvmeof.py

Lines changed: 66 additions & 20 deletions
@@ -209,6 +209,22 @@ def set_gateway_cfg(self):
         if self.create_mtls_secrets:
             self.write_mtls_config(gateway_ips)
         log.info("[nvmeof]: executed set_gateway_cfg successfully!")
+
+    def teardown(self):
+        log.info("[nvmeof] Removing nvmeof service")
+        _shell(self.ctx, self.cluster_name, self.remote, [
+            'ceph', 'orch', 'host', 'ls'
+        ])
+        for i in range(self.groups_count):
+            group_name = self.groups_prefix + str(i)
+            service_name = f"nvmeof.{self.poolname}.{group_name}"
+            _shell(self.ctx, self.cluster_name, self.remote, [
+                'ceph', 'orch', 'rm', service_name
+            ])
+        _shell(self.ctx, self.cluster_name, self.remote, [
+            'ceph', 'orch', 'host', 'ls'
+        ])
+        log.info("[nvmeof] Nvmeof teardown completed!")
 
 
 class NvmeofThrasher(Thrasher, Greenlet):
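
The new Nvmeof.teardown() removes each deployed gateway service by name, bracketing the removals with 'ceph orch host ls' so before/after host state lands in the teuthology log. The service names follow the same nvmeof.<pool>.<group> pattern used by cephadm.wait_for_service above; a minimal sketch of that naming, assuming the defaults visible in these suites (pool 'mypool', group prefix 'mygroup', one group):

    # Minimal sketch of the service names teardown() removes, assuming the
    # defaults used by these suites: pool 'mypool', group prefix 'mygroup'.
    poolname = "mypool"
    groups_prefix = "mygroup"
    groups_count = 1

    for i in range(groups_count):
        group_name = groups_prefix + str(i)
        service_name = f"nvmeof.{poolname}.{group_name}"
        print(service_name)  # -> nvmeof.mypool.mygroup0, matching cephadm.wait_for_service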
@@ -334,26 +350,41 @@ def _run(self): # overriding
     def stop(self):
         self.stopping.set()
 
+    def stop_and_join(self):
+        """
+        Stop the thrashing process and join the thread.
+        """
+        self.stop()
+        return self.join()
+
     def do_checks(self):
         """
         Run some checks to see if everything is running well during thrashing.
         """
         self.log('display and verify stats:')
-        for d in self.daemons:
-            d.remote.sh(d.status_cmd, check_status=False)
-        check_cmd = [
-            'ceph', 'orch', 'ls',
-            run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof',
-            run.Raw('&&'), 'ceph', 'health', 'detail',
-            run.Raw('&&'), 'ceph', '-s',
-            run.Raw('&&'), 'sudo', 'nvme', 'list',
-        ]
-        for dev in self.devices:
-            check_cmd += [
-                run.Raw('&&'), 'sudo', 'nvme', 'list-subsys', dev,
-                run.Raw('|'), 'grep', 'live optimized'
-            ]
-        self.checker_host.run(args=check_cmd).wait()
+        for retry in range(5):
+            try:
+                random_gateway_host = None
+                initiator_host = self.checker_host
+                for d in self.daemons:
+                    random_gateway_host = d.remote
+                    d.remote.sh(d.status_cmd, check_status=False)
+                random_gateway_host.run(args=['ceph', 'orch', 'ls', '--refresh'])
+                random_gateway_host.run(args=['ceph', 'orch', 'ps', '--daemon-type', 'nvmeof', '--refresh'])
+                random_gateway_host.run(args=['ceph', 'health', 'detail'])
+                random_gateway_host.run(args=['ceph', '-s'])
+                random_gateway_host.run(args=['ceph', 'nvme-gw', 'show', 'mypool', 'mygroup0'])
+
+                initiator_host.run(args=['sudo', 'nvme', 'list'])
+                for dev in self.devices:
+                    device_check_cmd = [
+                        'sudo', 'nvme', 'list-subsys', dev,
+                        run.Raw('|'), 'grep', 'live optimized'
+                    ]
+                    initiator_host.run(args=device_check_cmd)
+                break
+            except run.CommandFailedError:
+                self.log(f"retry do_checks() for {retry} time")
 
     def switch_task(self):
         """
@@ -373,13 +404,14 @@ def switch_task(self):
             ):
                 other_thrasher = t
         self.log('switch_task: waiting for other thrasher')
-        other_thrasher.switch_thrasher.wait(300)
+        other_thrasher.switch_thrasher.wait(600)
         self.log('switch_task: done waiting for the other thrasher')
         other_thrasher.switch_thrasher.clear()
 
     def kill_daemon(self, daemon):
         kill_methods = [
-            "ceph_daemon_stop", "systemctl_stop",
+            "ceph_daemon_stop",
+            # "systemctl_stop",
             "daemon_remove",
         ]
         chosen_method = self.rng.choice(kill_methods)
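
Two smaller robustness tweaks here: the wait for the peer thrasher in switch_task() is doubled from 300 to 600 seconds, and "systemctl_stop" is commented out of the kill-method pool, so the thrasher now only chooses between "ceph_daemon_stop" and "daemon_remove".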
@@ -390,7 +422,8 @@ def kill_daemon(self, daemon):
                 d_name
             ], check_status=False)
         elif chosen_method == "systemctl_stop":
-            daemon.stop()
+            # To bypass is_started logic of CephadmUnit
+            daemon.remote.sh(daemon.stop_cmd, check_status=False)
         elif chosen_method == "daemon_remove":
             daemon.remote.run(args=[
                 "ceph", "orch", "daemon", "rm",
@@ -399,14 +432,24 @@ def kill_daemon(self, daemon):
         return chosen_method
 
     def revive_daemon(self, daemon, killed_method):
+        name = '%s.%s' % (daemon.type_, daemon.id_)
         if killed_method == "ceph_daemon_stop":
-            name = '%s.%s' % (daemon.type_, daemon.id_)
             daemon.remote.run(args=[
                 "ceph", "orch", "daemon", "restart",
                 name
             ])
+        # note: temporarily use 'daemon start' to restart
+        # daemons instead of 'systemctl start'
         elif killed_method == "systemctl_stop":
-            daemon.restart()
+            daemon.remote.run(args=[
+                "ceph", "orch", "daemon", "start",
+                name
+            ])
+        else:
+            daemon.remote.run(args=[
+                "ceph", "orch", "daemon", "start",
+                name
+            ])
 
     def do_thrash(self):
         self.log('start thrashing')
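
revive_daemon() now computes the daemon name once up front and, for anything other than "ceph_daemon_stop", revives the daemon with 'ceph orch daemon start' instead of the previous daemon.restart(); the note in the diff marks this as a temporary substitute for 'systemctl start'. A minimal sketch of the resulting dispatch, with remote_run() as a hypothetical stand-in for daemon.remote.run(args=[...]):

    # Sketch of the revive dispatch after this change; remote_run() is a
    # hypothetical stand-in for daemon.remote.run(args=[...]).
    def revive(remote_run, daemon_type, daemon_id, killed_method):
        name = f"{daemon_type}.{daemon_id}"
        if killed_method == "ceph_daemon_stop":
            remote_run(["ceph", "orch", "daemon", "restart", name])
        else:
            # "systemctl_stop" and "daemon_remove" now take the same path:
            # 'ceph orch daemon start' rather than systemctl
            remote_run(["ceph", "orch", "daemon", "start", name])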
@@ -515,6 +558,9 @@ def end(self):
         self.thrasher.join()
         log.info('done joining')
 
+    def teardown(self):
+        log.info('tearing down nvmeof thrasher...')
+
 
 task = Nvmeof
 thrash = ThrashTest

qa/workunits/nvmeof/fio_test.sh

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,8 @@ fi
 
 
 RUNTIME=${RUNTIME:-600}
-
+filename=$(echo "$selected_drives" | sed -z 's/\n/:\/dev\//g' | sed 's/:\/dev\/$//')
+filename="/dev/$filename"
 
 cat >> $fio_file <<EOF
 [nvmeof-fio-test]
@@ -61,7 +62,7 @@ size=${SIZE:-1G}
 time_based=1
 runtime=$RUNTIME
 rw=${RW:-randrw}
-filename=$(echo "$selected_drives" | tr '\n' ':' | sed 's/:$//')
+filename=${filename}
 verify=md5
 verify_fatal=1
 direct=1
@@ -79,6 +80,5 @@ if [ "$rbd_iostat" = true ]; then
 fi
 fio --showcmd $fio_file
 sudo fio $fio_file
-wait
 
 echo "[nvmeof.fio] fio test successful!"

qa/workunits/nvmeof/scalability_test.sh

Lines changed: 1 addition & 4 deletions
@@ -32,10 +32,7 @@ status_checks() {
         return 1
     fi
 
-    ceph_status=$(ceph -s)
-    if ! echo "$ceph_status" | grep -q "HEALTH_OK"; then
-        return 1
-    fi
+    ceph -s
 }
 
 total_gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
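
status_checks() no longer parses 'ceph -s' for HEALTH_OK and returns failure when the cluster is not fully healthy; it simply runs 'ceph -s' so the status still appears in the log, presumably so transient health warnings during gateway scaling do not abort the run.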
