Commit ed261a0

Merge pull request ceph#60431 from VallariAg/fix-nvmeof-ns-test

qa: fix nvmeof scalability and namespace test

2 parents 672d7a1 + 09ade3d

File tree: 13 files changed (+128, −53 lines)

qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,8 @@
+# runs on default nvmeof image (i.e. DEFAULT_NVMEOF_IMAGE)
 tasks:
 - nvmeof:
     installer: host.a
-    gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+    gw_image: default # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
     rbd:
       pool_name: mypool
       image_name_prefix: myimage

qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
Lines changed: 1 addition & 1 deletion

@@ -18,6 +18,7 @@ tasks:
     clients:
       client.0:
         - nvmeof/setup_subsystem.sh
+        - nvmeof/basic_tests.sh
     env:
       RBD_POOL: mypool
       RBD_IMAGE_PREFIX: myimage
@@ -27,7 +28,6 @@ tasks:
     timeout: 30m
     clients:
       client.0:
-        - nvmeof/basic_tests.sh
        - nvmeof/fio_test.sh --rbd_iostat
       client.1:
        - nvmeof/basic_tests.sh

qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
Lines changed: 4 additions & 1 deletion

@@ -31,8 +31,11 @@ tasks:
     no_coverage_and_limits: true
     timeout: 30m
     clients:
-      client.0:
+      client.3:
        - nvmeof/scalability_test.sh nvmeof.a,nvmeof.b
        - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c,nvmeof.d
+       - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c
     env:
       SCALING_DELAYS: '50'
+      RBD_POOL: mypool
+      NVMEOF_GROUP: mygroup0
Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+tasks:
+- nvmeof:
+    installer: host.a
+    gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+    rbd:
+      pool_name: mypool
+      image_name_prefix: myimage
+    gateway_config:
+      subsystems_count: 10
+      namespaces_count: 90 # each subsystem
+      cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+    service: nvmeof.mypool.mygroup0
+
+- cephadm.exec:
+    host.a:
+      - ceph orch ls nvmeof --export > /tmp/nvmeof-orig.yaml
+      - cp /tmp/nvmeof-orig.yaml /tmp/nvmeof-no-huge-page.yaml
+      - "sed -i '/ pool: mypool/a\\ spdk_mem_size: 4096' /tmp/nvmeof-no-huge-page.yaml"
+      - cat /tmp/nvmeof-no-huge-page.yaml
+      - ceph orch ls --refresh
+      - ceph orch apply -i /tmp/nvmeof-no-huge-page.yaml
+      - ceph orch redeploy nvmeof.mypool.mygroup0
+
+- cephadm.wait_for_service:
+    service: nvmeof.mypool.mygroup0
+
+- workunit:
+    no_coverage_and_limits: true
+    clients:
+      client.0:
+        - nvmeof/setup_subsystem.sh
+        - nvmeof/basic_tests.sh
+    env:
+      RBD_POOL: mypool
+      RBD_IMAGE_PREFIX: myimage
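Note: the sed step above copies the exported nvmeof service spec and appends spdk_mem_size: 4096 right after the "pool: mypool" line before re-applying and redeploying, so the gateways come back up without huge pages. A minimal Python sketch of that text edit (the sample spec fields are illustrative, not taken from this commit):

# Sketch: insert "spdk_mem_size: 4096" after the "pool: mypool" line of an
# exported nvmeof service spec, mirroring the sed command in the YAML above.
def add_spdk_mem_size(spec_text: str, mem_mb: int = 4096) -> str:
    out = []
    for line in spec_text.splitlines():
        out.append(line)
        if line.strip() == "pool: mypool":
            # keep the same indentation as the matched line
            indent = line[: len(line) - len(line.lstrip())]
            out.append(f"{indent}spdk_mem_size: {mem_mb}")
    return "\n".join(out) + "\n"

# Illustrative example spec, not the real exported one.
example_spec = (
    "service_type: nvmeof\n"
    "service_id: mypool.mygroup0\n"
    "spec:\n"
    "  pool: mypool\n"
    "  group: mygroup0\n"
)
print(add_spdk_mem_size(example_spec))  # prints the spec with spdk_mem_size added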

qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml renamed to qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml
Lines changed: 2 additions & 2 deletions

@@ -6,8 +6,8 @@ tasks:
       pool_name: mypool
       image_name_prefix: myimage
     gateway_config:
-      subsystems_count: 3
-      namespaces_count: 20 # each subsystem
+      subsystems_count: 120
+      namespaces_count: 8 # each subsystem
       cli_image: quay.io/ceph/nvmeof-cli:latest

 - cephadm.wait_for_service:
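For reference, the rename tracks the change in scale of this thrash setup: the old file created 3 subsystems with 20 namespaces each, the new one creates 120 subsystems with 8 namespaces each. A quick check of the totals:

# Total namespaces created by the gateway-initiator setup, old vs. new config.
old_total = 3 * 20      # 3 subsystems x 20 namespaces each = 60
new_total = 120 * 8     # 120 subsystems x 8 namespaces each = 960
print(old_total, new_total)  # 60 960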

qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@ overrides:
       - NVMEOF_SINGLE_GATEWAY
       - NVMEOF_GATEWAY_DOWN
       - are in unavailable state
+      - is unavailable
       - is in error state
       - failed cephadm daemon

qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
Lines changed: 2 additions & 0 deletions

@@ -6,9 +6,11 @@ overrides:
       - NVMEOF_SINGLE_GATEWAY
       - NVMEOF_GATEWAY_DOWN
       - are in unavailable state
+      - is unavailable
       - is in error state
       - failed cephadm daemon

 tasks:
 - nvmeof.thrash:
     checker_host: 'client.0'
+    randomize: False
Lines changed: 3 additions & 3 deletions

@@ -1,11 +1,11 @@
 tasks:
 - workunit:
     no_coverage_and_limits: true
-    timeout: 30m
+    timeout: 60m
     clients:
       client.0:
-        - nvmeof/fio_test.sh --rbd_iostat
+        - nvmeof/fio_test.sh --random_devices 200
     env:
       RBD_POOL: mypool
       IOSTAT_INTERVAL: '10'
-      RUNTIME: '600'
+      RUNTIME: '1800'

qa/tasks/nvmeof.py
Lines changed: 14 additions & 22 deletions

@@ -128,12 +128,11 @@ def deploy_nvmeof(self):

         total_images = int(self.namespaces_count) * int(self.subsystems_count)
         log.info(f'[nvmeof]: creating {total_images} images')
+        rbd_create_cmd = []
         for i in range(1, total_images + 1):
             imagename = self.image_name_prefix + str(i)
-            log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}')
-            _shell(self.ctx, self.cluster_name, self.remote, [
-                'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}'
-            ])
+            rbd_create_cmd += ['rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}', run.Raw(';')]
+        _shell(self.ctx, self.cluster_name, self.remote, rbd_create_cmd)

         for role, i in daemons.items():
             remote, id_ = i
@@ -251,9 +250,9 @@ class NvmeofThrasher(Thrasher, Greenlet):

     daemon_max_thrash_times:
         For now, NVMeoF daemons have limitation that each daemon can
-        be thrashed only 3 times in span of 30 mins. This option
+        be thrashed only 5 times in span of 30 mins. This option
         allows to set the amount of times it could be thrashed in a period
-        of time. (default: 3)
+        of time. (default: 5)
     daemon_max_thrash_period:
         This option goes with the above option. It sets the period of time
         over which each daemons can be thrashed for daemon_max_thrash_times
@@ -306,12 +305,12 @@ def __init__(self, ctx, config, daemons) -> None:
         self.max_thrash_daemons = int(self.config.get('max_thrash', len(self.daemons) - 1))

         # Limits on thrashing each daemon
-        self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 3))
+        self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 5))
         self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds

         self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60))
         self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30))
-        self.min_revive_delay = int(self.config.get('min_revive_delay', 100))
+        self.min_revive_delay = int(self.config.get('min_revive_delay', 60))
         self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30))

     def _get_devices(self, remote):
@@ -347,6 +346,7 @@ def do_checks(self):
             run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof',
             run.Raw('&&'), 'ceph', 'health', 'detail',
             run.Raw('&&'), 'ceph', '-s',
+            run.Raw('&&'), 'sudo', 'nvme', 'list',
         ]
         for dev in self.devices:
             check_cmd += [
@@ -421,13 +421,11 @@ def do_thrash(self):
         while not self.stopping.is_set():
             killed_daemons = defaultdict(list)

-            weight = 1.0 / len(self.daemons)
-            count = 0
+            thrash_daemon_num = self.rng.randint(1, self.max_thrash_daemons)
+            selected_daemons = self.rng.sample(self.daemons, thrash_daemon_num)
             for daemon in self.daemons:
-                skip = self.rng.uniform(0.0, 1.0)
-                if weight <= skip:
-                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
-                        label=daemon.id_, skip=skip, weight=weight))
+                if daemon not in selected_daemons:
+                    self.log(f'skipping daemon {daemon.id_} ...')
                     continue

                 # For now, nvmeof daemons can only be thrashed 3 times in last 30mins.
@@ -445,17 +443,11 @@ def do_thrash(self):
                     continue

                 self.log('kill {label}'.format(label=daemon.id_))
-                # daemon.stop()
                 kill_method = self.kill_daemon(daemon)

                 killed_daemons[kill_method].append(daemon)
                 daemons_thrash_history[daemon.id_] += [datetime.now()]

-                # only thrash max_thrash_daemons amount of daemons
-                count += 1
-                if count >= self.max_thrash_daemons:
-                    break
-
             if killed_daemons:
                 iteration_summary = "thrashed- "
                 for kill_method in killed_daemons:
@@ -468,7 +460,7 @@ def do_thrash(self):

                 self.log(f'waiting for {revive_delay} secs before reviving')
                 time.sleep(revive_delay) # blocking wait
-                self.log('done waiting before reviving')
+                self.log(f'done waiting before reviving - iteration #{len(summary)}: {iteration_summary}')

                 self.do_checks()
                 self.switch_task()
@@ -487,7 +479,7 @@ def do_thrash(self):
             if thrash_delay > 0.0:
                 self.log(f'waiting for {thrash_delay} secs before thrashing')
                 time.sleep(thrash_delay) # blocking
-                self.log('done waiting before thrashing')
+                self.log('done waiting before thrashing - everything should be up now')

             self.do_checks()
             self.switch_task()
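The do_thrash() change above replaces the per-daemon probability check with an explicit sample: each iteration picks between 1 and max_thrash gateways up front and skips the rest, so the number of thrashed daemons per iteration is bounded without the extra count/break bookkeeping. A standalone sketch of that selection logic (daemon names and the seed are placeholders for illustration):

import random

# Sketch of the new per-iteration selection in do_thrash(): choose how many
# daemons to thrash, sample that many, and skip everything else.
daemons = ['nvmeof.a', 'nvmeof.b', 'nvmeof.c', 'nvmeof.d']   # placeholder ids
max_thrash = len(daemons) - 1       # mirrors the default: len(self.daemons) - 1
rng = random.Random(42)             # seeded only to make this sketch repeatable

thrash_daemon_num = rng.randint(1, max_thrash)
selected_daemons = rng.sample(daemons, thrash_daemon_num)
for daemon in daemons:
    if daemon not in selected_daemons:
        print(f'skipping daemon {daemon} ...')
        continue
    print(f'kill {daemon}')         # the real task calls self.kill_daemon(daemon)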

qa/workunits/nvmeof/basic_tests.sh
Lines changed: 7 additions & 3 deletions

@@ -38,8 +38,10 @@ disconnect_all() {
 connect_all() {
     sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600
     sleep 5
-    output=$(sudo nvme list --output-format=json)
-    if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then
+    expected_devices_count=$1
+    actual_devices=$(sudo nvme list --output-format=json | grep -o "$SPDK_CONTROLLER" | wc -l)
+    if [ "$actual_devices" -ne "$expected_devices_count" ]; then
+        sudo nvme list --output-format=json
         return 1
     fi
 }
@@ -72,11 +74,13 @@ test_run connect
 test_run list_subsys 1
 test_run disconnect_all
 test_run list_subsys 0
-test_run connect_all
+devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT))
+test_run connect_all $devices_count
 gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
 multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT))
 test_run list_subsys $multipath_count


+
 echo "-------------Test Summary-------------"
 echo "[nvmeof] All nvmeof basic tests passed!"
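With this change connect_all() takes the expected device count as an argument and compares it against the number of SPDK controllers reported by nvme list, while the caller derives that count from the subsystem and namespace totals. A small sketch of the arithmetic, assuming example values for the NVMEOF_* environment variables:

# Sketch of the expected-count arithmetic in basic_tests.sh, with example values
# standing in for the NVMEOF_* environment variables.
namespaces_count = 90                                   # NVMEOF_NAMESPACES_COUNT (per subsystem)
subsystems_count = 10                                   # NVMEOF_SUBSYSTEMS_COUNT
gateway_ips = "10.0.0.1,10.0.0.2,10.0.0.3,10.0.0.4"     # NVMEOF_GATEWAY_IP_ADDRESSES (example)

devices_count = namespaces_count * subsystems_count     # devices expected after connect_all
gateways_count = gateway_ips.count(",") + 1             # gateways in the comma-separated list
multipath_count = gateways_count * subsystems_count     # subsystem paths expected by list_subsys
print(devices_count, gateways_count, multipath_count)   # 900 4 40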
