Skip to content

Commit 4401a6c

Browse files
committed
Merge PR ceph#56536 into main
* refs/pull/56536/head: mds: do not dispatch aborted internal requests; qa: add test for killing quiesce_inode with outstanding remote authpin requests; mds: add lockup debugging command. Reviewed-by: Leonid Usov <[email protected]>
2 parents 53d906b + 604112e commit 4401a6c

File tree

3 files changed

+80
-0
lines changed

3 files changed

+80
-0
lines changed

qa/tasks/cephfs/test_quiesce.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,69 @@ def test_quiesce_path_splitauth(self):
638638
op = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--wait'], rank=0)['op']
639639
self.assertEqual(op['result'], -1) # EPERM
640640

641+
def test_quiesce_authpin_wait(self):
642+
"""
643+
That a quiesce_inode op on rank 0 with outstanding remote authpin requests can be killed while the peer MDS is locked up.
644+
"""
645+
646+
self.config_set('mds', 'mds_heartbeat_grace', '60')
647+
self._configure_subvolume()
648+
self.mount_a.setfattr(".", "ceph.dir.pin.distributed", "1")
649+
self._client_background_workload()
650+
self._wait_distributed_subtrees(2*2, rank="all", path=self.mntpnt)
651+
status = self.fs.status()
652+
653+
p = self.mount_a.run_shell_payload("ls", stdout=StringIO())
654+
dirs = p.stdout.getvalue().strip().split()
655+
656+
# make rank 1 (the target of this tell) unresponsive to auth pin requests: "lockup" sleeps for 30000 ms with mds_lock held; run in the background (wait=False)
657+
p = self.run_ceph_cmd("tell", f"mds.{self.fs.id}:1", "lockup", "30000", wait=False)
658+
659+
qops = []
660+
for d in dirs:
661+
path = os.path.join(self.mntpnt, d)
662+
op = self.fs.rank_tell("quiesce", "path", path, rank=0)['op']
663+
reqid = self.reqid_tostr(op['reqid'])
664+
log.info(f"created {reqid}")
665+
qops.append(reqid)
666+
667+
def find_quiesce(blocked_on_remote_auth_pin):
668+
# report whether rank 0 has an in-flight quiesce_inode op; when blocked_on_remote_auth_pin is set, only ops at the "requesting remote authpins" flag point count
669+
ops = self.fs.get_ops(locks=False, rank=0, path="/tmp/mds.0-ops", status=status)['ops']
670+
for op in ops:
671+
type_data = op['type_data']
672+
flag_point = type_data['flag_point']
673+
op_type = type_data['op_type']
674+
if op_type == 'client_request' or op_type == 'peer_request':
675+
continue
676+
if type_data['op_name'] == "quiesce_inode":
677+
if blocked_on_remote_auth_pin:
678+
if flag_point == "requesting remote authpins":
679+
return True
680+
else:
681+
return True
682+
return False
683+
684+
with safe_while(sleep=1, tries=30, action='wait for quiesce op with outstanding remote authpin requests') as proceed:
685+
while proceed():
686+
if find_quiesce(True):
687+
break
688+
689+
# okay, now kill all quiesce ops created above (by reqid, on rank 0)
690+
for reqid in qops:
691+
self.fs.kill_op(reqid, rank=0)
692+
693+
# verify some quiesce_inode ops still exist because authpin acks have not been received
694+
if not find_quiesce(True):
695+
self.fail("did not find quiesce_inode op blocked on remote authpins!")
696+
697+
# wait for the background lockup command (the MDS sleep) to complete
698+
p.wait()
699+
700+
with safe_while(sleep=1, tries=30, action='wait for quiesce kill') as proceed:
701+
while proceed():
702+
if not find_quiesce(False):
703+
break
641704

642705
def test_quiesce_path_multirank(self):
643706
"""

src/mds/MDCache.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9818,6 +9818,11 @@ void MDCache::dispatch_request(const MDRequestRef& mdr)
98189818
} else if (mdr->peer_request) {
98199819
mds->server->dispatch_peer_request(mdr);
98209820
} else {
9821+
if (mdr->aborted) {
9822+
mdr->aborted = false;
9823+
request_kill(mdr);
9824+
return;
9825+
}
98219826
switch (mdr->internal_op) {
98229827
case CEPH_MDS_OP_QUIESCE_PATH:
98239828
dispatch_quiesce_path(mdr);

src/mds/MDSDaemon.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,13 @@ void MDSDaemon::asok_command(
149149
if (command == "status") {
150150
dump_status(f);
151151
r = 0;
152+
} else if (command == "lockup") {
153+
int64_t millisecs;
154+
cmd_getval(cmdmap, "millisecs", millisecs);
155+
derr << "(lockup) sleeping with mds_lock for " << millisecs << dendl;
156+
std::lock_guard l(mds_lock);
157+
std::this_thread::sleep_for(std::chrono::milliseconds(millisecs));
158+
r = 0;
152159
} else if (command == "exit") {
153160
outbl.append("Exiting...\n");
154161
r = 0;
@@ -256,6 +263,11 @@ void MDSDaemon::set_up_admin_socket()
256263
r = admin_socket->register_command("status", asok_hook,
257264
"high-level status of MDS");
258265
ceph_assert(r == 0);
266+
r = admin_socket->register_command("lockup "
267+
"name=millisecs,type=CephInt,req=true,range=0"
268+
, asok_hook
269+
, "sleep with mds_lock held (dev)");
270+
ceph_assert(r == 0);
259271
r = admin_socket->register_command("dump_ops_in_flight", asok_hook,
260272
"show the ops currently in flight");
261273
ceph_assert(r == 0);

0 commit comments

Comments
 (0)