
Commit d17c681

Merge PR ceph#56052 into main
* refs/pull/56052/head:
  qa/suites: ignore unresponsive client when the test passes
  qa: enhance per-client labelled perf counters test

Reviewed-by: Venky Shankar <[email protected]>
2 parents: 9120b86 + 5f2a3a7

File tree: 2 files changed (+91 lines, -17 lines)


qa/suites/fs/functional/tasks/admin.yaml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ overrides:
       - \(MDS_TRIM\)
       - \(MDS_CLIENTS_BROKEN_ROOTSQUASH\)
       - report clients with broken root_squash implementation
+      - evicting unresponsive client
 tasks:
   - cephfs_test_runner:
       fail_on_skip: false
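For context, the entries under this suite's log-ignorelist are patterns matched against cluster log warnings so that expected messages do not fail an otherwise passing run; the new entry covers the "evicting unresponsive client" warning that the disconnect test below deliberately provokes. The snippet that follows is a minimal standalone sketch of that matching idea, not teuthology code: the is_ignored helper and the sample log line are invented for illustration.

import re

# Patterns mirroring the ignorelist above; entries are treated as regular expressions.
IGNORELIST = [
    r"\(MDS_TRIM\)",
    r"\(MDS_CLIENTS_BROKEN_ROOTSQUASH\)",
    r"report clients with broken root_squash implementation",
    r"evicting unresponsive client",  # entry added by this change
]

def is_ignored(log_line, patterns=IGNORELIST):
    # Hypothetical check: does any ignorelist pattern occur in the line?
    return any(re.search(p, log_line) for p in patterns)

# A made-up warning of the kind the disconnect test can provoke.
sample = "mds.a [WRN] evicting unresponsive client foo (4305)"
assert is_ignored(sample)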

qa/tasks/cephfs/test_admin.py

Lines changed: 90 additions & 17 deletions
@@ -16,23 +16,88 @@
                                       gen_osd_cap_str, gen_mds_cap_str)
 
 log = logging.getLogger(__name__)
+MDS_RESTART_GRACE = 60
 
 class TestLabeledPerfCounters(CephFSTestCase):
     CLIENTS_REQUIRED = 2
     MDSS_REQUIRED = 1
 
-    def test_per_client_labeled_perf_counters(self):
+    def _get_counters_for(self, filesystem, client_id):
+        dump = self.fs.rank_tell(["counter", "dump"])
+        per_client_metrics_key = f'mds_client_metrics-{filesystem}'
+        counters = [c["counters"] for \
+                    c in dump[per_client_metrics_key] if c["labels"]["client"] == client_id]
+        return counters[0]
+
+    def test_per_client_labeled_perf_counters_on_client_disconnect(self):
+        """
+        That the per-client labelled metrics are unavailable during client disconnect
+        """
+        mount_a_id = f'client.{self.mount_a.get_global_id()}'
+        self.mount_a.teardown()
+        with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_a_id}') as proceed:
+            while proceed():
+                dump = self.fs.rank_tell(["counter", "dump"])
+                per_client_metrics_key = f"mds_client_metrics-{dump['mds_client_metrics'][0]['labels']['fs_name']}"
+                clients = [c["labels"]["client"] for c in dump.get(per_client_metrics_key, {})]
+                if clients and mount_a_id not in clients:
+                    # success, no metrics.
+                    return True
+
+    def test_per_client_labeled_perf_counters_on_client_reconnect(self):
         """
-        That the per-client labelled perf counters depict the clients
-        performaing IO.
+        That the per-client labelled metrics are generated during client reconnect
         """
-        def get_counters_for(filesystem, client_id):
-            dump = self.fs.rank_tell(["counter", "dump"])
-            per_client_metrics_key = f'mds_client_metrics-{filesystem}'
-            counters = [c["counters"] for \
-                        c in dump[per_client_metrics_key] if c["labels"]["client"] == client_id]
-            return counters[0]
+        # fail active mds and wait for reconnect
+        mds = self.fs.get_active_names()[0]
+        self.mds_cluster.mds_fail(mds)
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+        mount_a_id = f'client.{self.mount_a.get_global_id()}'
+        mount_b_id = f'client.{self.mount_b.get_global_id()}'
+        fs_suffix = ""
+
+        with safe_while(sleep=1, tries=30, action='wait for counters') as proceed:
+            while proceed():
+                dump = self.fs.rank_tell(["counter", "dump"])
+                fs_suffix = dump['mds_client_metrics'][0]['labels']['fs_name']
+                per_client_metrics_key = f"mds_client_metrics-{fs_suffix}"
+                clients = [c["labels"]["client"] for c in dump.get(per_client_metrics_key, {})]
+                if mount_a_id in clients and mount_b_id in clients:
+                    # success, got metrics.
+                    break # break to continue the test
+
+        # Post reconnecting, validate the io perf counters
+        # write workload
+        self.mount_a.create_n_files("test_dir/test_file", 1000, sync=True)
+        with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_a_id}') as proceed:
+            while proceed():
+                counters_dump_a = self._get_counters_for(fs_suffix, mount_a_id)
+                if counters_dump_a["total_write_ops"] > 0 and counters_dump_a["total_write_size"] > 0 and \
+                   counters_dump_a["avg_write_latency"] >= 0 and counters_dump_a["avg_metadata_latency"] >= 0 and \
+                   counters_dump_a["opened_files"] >= 0 and counters_dump_a["opened_inodes"] > 0 and \
+                   counters_dump_a["cap_hits"] > 0 and counters_dump_a["dentry_lease_hits"] > 0 and \
+                   counters_dump_a["pinned_icaps"] > 0:
+                    break # break to continue the test
+
+        # read from the other client
+        for i in range(100):
+            self.mount_b.open_background(basename=f'test_dir/test_file_{i}', write=False)
+        with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_b_id}') as proceed:
+            while proceed():
+                counters_dump_b = self._get_counters_for(fs_suffix, mount_b_id)
+                if counters_dump_b["total_read_ops"] >= 0 and counters_dump_b["total_read_size"] >= 0 and \
+                   counters_dump_b["avg_read_latency"] >= 0 and counters_dump_b["avg_metadata_latency"] >= 0 and \
+                   counters_dump_b["opened_files"] >= 0 and counters_dump_b["opened_inodes"] >= 0 and \
+                   counters_dump_b["cap_hits"] > 0 and counters_dump_a["dentry_lease_hits"] > 0 and \
+                   counters_dump_b["pinned_icaps"] > 0:
+                    break # break to continue the test
+        self.mount_a.teardown()
+        self.mount_b.teardown()
 
+    def test_per_client_labeled_perf_counters_io(self):
+        """
+        That the per-client labelled perf counters depict the clients performing IO.
+        """
         # sleep a bit so that we get updated clients...
         sleep(10)
 
@@ -53,21 +118,29 @@ def get_counters_for(filesystem, client_id):
         # write workload
         self.mount_a.create_n_files("test_dir/test_file", 1000, sync=True)
         with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_a_id}') as proceed:
-            counters_dump_a = get_counters_for(fs_suffix, mount_a_id)
             while proceed():
-                if counters_dump_a["total_write_ops"] > 0 and counters_dump_a["total_write_size"] > 0:
-                    return True
+                counters_dump_a = self._get_counters_for(fs_suffix, mount_a_id)
+                if counters_dump_a["total_write_ops"] > 0 and counters_dump_a["total_write_size"] > 0 and \
+                   counters_dump_a["avg_write_latency"] >= 0 and counters_dump_a["avg_metadata_latency"] >= 0 and \
+                   counters_dump_a["opened_files"] >= 0 and counters_dump_a["opened_inodes"] > 0 and \
+                   counters_dump_a["cap_hits"] > 0 and counters_dump_a["dentry_lease_hits"] > 0 and \
+                   counters_dump_a["pinned_icaps"] > 0:
+                    break # break to continue the test
 
         # read from the other client
         for i in range(100):
             self.mount_b.open_background(basename=f'test_dir/test_file_{i}', write=False)
         with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_b_id}') as proceed:
-            counters_dump_b = get_counters_for(fs_suffix, mount_b_id)
             while proceed():
-                if counters_dump_b["total_read_ops"] > 0 and counters_dump_b["total_read_size"] > 0:
-                    return True
-
-        self.fs.teardown()
+                counters_dump_b = self._get_counters_for(fs_suffix, mount_b_id)
+                if counters_dump_b["total_read_ops"] >= 0 and counters_dump_b["total_read_size"] >= 0 and \
+                   counters_dump_b["avg_read_latency"] >= 0 and counters_dump_b["avg_metadata_latency"] >= 0 and \
+                   counters_dump_b["opened_files"] >= 0 and counters_dump_b["opened_inodes"] >= 0 and \
+                   counters_dump_b["cap_hits"] > 0 and counters_dump_a["dentry_lease_hits"] > 0 and \
+                   counters_dump_b["pinned_icaps"] > 0:
+                    break # break to continue the test
+        self.mount_a.teardown()
+        self.mount_b.teardown()
 
 class TestAdminCommands(CephFSTestCase):
     """