Skip to content

Commit 25e4ee2

Browse files
committed
Merge PR ceph#57579 into main
* refs/pull/57579/head: mds/quiesce: disable quiesce root debug parameters by default mds/quiesce-agt: never send a synchronous ack mds/quiesce-agt: add test for a rapid async ack mds/quiesce: always abort fragmenting asynchronously to prevent reentrancy mds/quiesce: overdrive an export if it hasn't frozen the tree yet mds/quiesce: quiesce_inode should not hold on to remote auth pins qa/cephfs: check that a completed quiesce doesn't hold remote auth pins mds: add `--lifetime` parameter to the `lock path` asok command mds/quiesce: accept a regular file as the quiesce root mds: command_quiesce_path: rename `--wait` to `--await` for consistency mds: command_quiesce_path: do not block the asok thread and return an adequate rc Reviewed-by: Patrick Donnelly <[email protected]>
2 parents 05cd407 + a9cb3a5 commit 25e4ee2

File tree

16 files changed

+414
-212
lines changed

16 files changed

+414
-212
lines changed

qa/tasks/cephfs/test_quiesce.py

Lines changed: 140 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -461,18 +461,28 @@ def resumed():
461461

462462
def test_quiesce_path_link_terminal(self):
463463
"""
464-
That quiesce on path with an terminal link fails with ENOTDIR even
465-
pointing to a valid subvolume.
464+
That quiesce on path with an terminal link quiesces just the link inode
466465
"""
467466

468467
self._configure_subvolume()
469468

470-
self.mount_a.run_shell_payload("ln -s ../.. subvol_quiesce")
471-
path = self.mount_a.cephfs_mntpt + "/subvol_quiesce"
469+
self.mount_a.run_shell_payload("mkdir -p dir/")
470+
self.mount_a.write_file("dir/afile", "I'm a file")
471+
self.mount_a.run_shell_payload("ln -s dir symlink_to_dir")
472+
path = self.mount_a.cephfs_mntpt + "/symlink_to_dir"
472473

473-
J = self.fs.rank_tell(["quiesce", "path", path, '--wait'])
474-
log.debug(f"{J}")
475-
self.assertEqual(J['op']['result'], -20) # ENOTDIR: the link is not a directory
474+
# MDS doesn't treat symlinks differently from regular inodes,
475+
# so quiescing one is allowed
476+
self.quiesce_and_verify(path)
477+
478+
# however, this also means that the directory this symlink points to isn't quiesced
479+
ops = self.fs.get_ops()
480+
quiesce_inode = 0
481+
for op in ops['ops']:
482+
op_name = op['type_data'].get('op_name', None)
483+
if op_name == "quiesce_inode":
484+
quiesce_inode += 1
485+
self.assertEqual(1, quiesce_inode)
476486

477487
def test_quiesce_path_link_intermediate(self):
478488
"""
@@ -484,7 +494,7 @@ def test_quiesce_path_link_intermediate(self):
484494
self.mount_a.run_shell_payload("ln -s ../../.. _nogroup")
485495
path = self.mount_a.cephfs_mntpt + "/_nogroup/" + self.QUIESCE_SUBVOLUME
486496

487-
J = self.fs.rank_tell(["quiesce", "path", path, '--wait'])
497+
J = self.fs.rank_tell(["quiesce", "path", path, '--await'], check_status=False)
488498
log.debug(f"{J}")
489499
self.assertEqual(J['op']['result'], -20) # ENOTDIR: path_traverse: the intermediate link is not a directory
490500

@@ -498,24 +508,24 @@ def test_quiesce_path_notsubvol(self):
498508
self.mount_a.run_shell_payload("mkdir dir")
499509
path = self.mount_a.cephfs_mntpt + "/dir"
500510

501-
J = self.fs.rank_tell(["quiesce", "path", path, '--wait'])
511+
J = self.fs.rank_tell(["quiesce", "path", path, '--await'], check_status=False)
502512
reqid = self._reqid_tostr(J['op']['reqid'])
503513
self._wait_for_quiesce_complete(reqid, path=path)
504514
self._verify_quiesce(root=path)
505515

506516
def test_quiesce_path_regfile(self):
507517
"""
508-
That quiesce on a regular file fails with ENOTDIR.
518+
That quiesce on a regular file is possible.
509519
"""
510520

511521
self._configure_subvolume()
512522

513523
self.mount_a.run_shell_payload("touch file")
514524
path = self.mount_a.cephfs_mntpt + "/file"
515525

516-
J = self.fs.rank_tell(["quiesce", "path", path, '--wait'])
526+
J = self.fs.rank_tell(["quiesce", "path", path, '--await'], check_status=False)
517527
log.debug(f"{J}")
518-
self.assertEqual(J['op']['result'], -20) # ENOTDIR
528+
self.assertEqual(J['op']['result'], 0)
519529

520530
def test_quiesce_path_dup(self):
521531
"""
@@ -525,9 +535,9 @@ def test_quiesce_path_dup(self):
525535

526536
self._configure_subvolume()
527537

528-
op1 = self.fs.rank_tell(["quiesce", "path", self.subvolume])['op']
538+
op1 = self.fs.rank_tell(["quiesce", "path", self.subvolume], check_status=False)['op']
529539
op1_reqid = self._reqid_tostr(op1['reqid'])
530-
op2 = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--wait'])['op']
540+
op2 = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--await'], check_status=False)['op']
531541
op1 = self.fs.get_op(op1_reqid)['type_data'] # for possible dup result
532542
log.debug(f"op1 = {op1}")
533543
log.debug(f"op2 = {op2}")
@@ -545,7 +555,7 @@ def test_quiesce_blocked(self):
545555
self.mount_a.run_shell_payload("touch file")
546556
self.mount_a.setfattr("file", "ceph.quiesce.block", "1")
547557

548-
J = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--wait'])
558+
J = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--await'], check_status=False)
549559
log.debug(f"{J}")
550560
self.assertEqual(J['op']['result'], 0)
551561
self.assertEqual(J['state']['inodes_blocked'], 1)
@@ -560,7 +570,7 @@ def test_quiesce_slow(self):
560570
self._configure_subvolume()
561571
self._client_background_workload()
562572

563-
J = self.fs.rank_tell(["quiesce", "path", self.subvolume])
573+
J = self.fs.rank_tell(["quiesce", "path", self.subvolume], check_status=False)
564574
log.debug(f"{J}")
565575
reqid = self._reqid_tostr(J['op']['reqid'])
566576
self._wait_for_quiesce_complete(reqid)
@@ -582,7 +592,7 @@ def test_quiesce_find(self):
582592
# drop cache
583593
self.fs.rank_tell(["cache", "drop"])
584594

585-
J = self.fs.rank_tell(["quiesce", "path", self.subvolume])
595+
J = self.fs.rank_tell(["quiesce", "path", self.subvolume], check_status=False)
586596
log.debug(f"{J}")
587597
reqid = self._reqid_tostr(J['op']['reqid'])
588598
self._wait_for_quiesce_complete(reqid)
@@ -684,11 +694,69 @@ def test_quiesce_path_splitauth(self):
684694
self._client_background_workload()
685695
self._wait_distributed_subtrees(2*2, rank="all", path=self.mntpnt)
686696

687-
op = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--wait'], rank=0)['op']
697+
op = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--await'], rank=0, check_status=False)['op']
688698
self.assertEqual(op['result'], -1) # EPERM
689699

690-
@unittest.skip("https://tracker.ceph.com/issues/66152")
691-
def test_quiesce_drops_remote_authpins_on_failure(self):
700+
def test_quiesce_drops_remote_authpins_when_done(self):
701+
"""
702+
That a quiesce operation drops remote authpins after marking the node as quiesced
703+
704+
It's important that a remote quiesce doesn't stall freezing ops on the auth
705+
"""
706+
self._configure_subvolume()
707+
708+
# create two dirs for pinning
709+
self.mount_a.run_shell_payload("mkdir -p pin0 pin1")
710+
# enable export by populating the directories
711+
self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy")
712+
# pin the files to different ranks
713+
self.mount_a.setfattr("pin0", "ceph.dir.pin", "0")
714+
self.mount_a.setfattr("pin1", "ceph.dir.pin", "1")
715+
716+
# prepare the patient at rank 0
717+
self.mount_a.write_file("pin0/thefile", "I'm ready, doc")
718+
719+
# wait for the export to settle
720+
self._wait_subtrees([(f"{self.mntpnt}/pin0", 0), (f"{self.mntpnt}/pin1", 1)])
721+
722+
def reqid(cmd):
723+
J = json.loads(cmd.stdout.getvalue())
724+
J = J.get('type_data', J) # for op get
725+
J = J.get('op', J) # for quiesce path
726+
# lock path returns the op directly
727+
return self._reqid_tostr(J['reqid'])
728+
729+
def assertQuiesceOpDone(expected_done, quiesce_op, rank):
730+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:{rank} op get {quiesce_op}", stdout=StringIO())
731+
732+
J = json.loads(cmd.stdout.getvalue())
733+
self.assertEqual(J['type_data']['result'], 0 if expected_done else None)
734+
735+
# Take the policy lock on the auth to cause a quiesce operation to request the remote authpin
736+
# This is needed to cause the next command to block
737+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:0 lock path {self.mntpnt}/pin0/thefile policy:x --await", stdout=StringIO())
738+
policy_block_op = reqid(cmd)
739+
740+
# Try quiescing on the replica. This should block for the policy lock
741+
# As a side effect, it should take the remote authpin
742+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 quiesce path {self.mntpnt}/pin0/thefile", stdout=StringIO())
743+
quiesce_op = reqid(cmd)
744+
745+
# verify the quiesce is pending
746+
assertQuiesceOpDone(False, quiesce_op, rank=1)
747+
748+
# kill the op that holds the policy lock exclusively and verify the quiesce succeeds
749+
self.fs.kill_op(policy_block_op, rank=0)
750+
assertQuiesceOpDone(True, quiesce_op, rank=1)
751+
752+
# If all is good, the ap-freeze operation below should succeed
753+
# despite the quiesce_op that's still active.
754+
# We payload this with some lock that we know shouldn't block
755+
# The call below will block on freezing if the quiesce failed to release
756+
# remote authpins, and after the lifetime elapses will return ECANCELED
757+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 lock path {self.mntpnt}/pin0/thefile policy:r --ap-freeze --await --lifetime 5")
758+
759+
def test_request_drops_remote_authpins_when_waiting_for_quiescelock(self):
692760
"""
693761
That remote authpins are dropped when the request fails to acquire the quiesce lock
694762
@@ -736,65 +804,71 @@ def test_quiesce_authpin_wait(self):
736804
That a quiesce_inode op with outstanding remote authpin requests can be killed.
737805
"""
738806

739-
self.config_set('mds', 'mds_heartbeat_grace', '120')
740-
self.fs.set_session_timeout(240) # avoid spurious session warnings
741-
self._configure_subvolume()
742-
self.mount_a.setfattr(".", "ceph.dir.pin.distributed", "1")
743-
self._client_background_workload()
744-
self._wait_distributed_subtrees(2*2, rank="all", path=self.mntpnt)
745-
status = self.fs.status()
807+
# create two dirs for pinning
808+
self.mount_a.run_shell_payload("mkdir -p pin0 pin1")
809+
# enable export by populating the directories
810+
self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy")
811+
# pin the files to different ranks
812+
self.mount_a.setfattr("pin0", "ceph.dir.pin", "0")
813+
self.mount_a.setfattr("pin1", "ceph.dir.pin", "1")
746814

747-
p = self.mount_a.run_shell_payload("ls", stdout=StringIO())
748-
dirs = p.stdout.getvalue().strip().split()
815+
# prepare the patient at rank 0
816+
self.mount_a.write_file("pin0/thefile", "I'm ready, doc")
749817

750-
# make rank 1 unresponsive to auth pin requests
751-
p = self.run_ceph_cmd("tell", f"mds.{self.fs.id}:1", "lockup", "90000", wait=False)
818+
# wait for the export to settle
819+
self._wait_subtrees([("/pin0", 0), ("/pin1", 1)])
752820

753-
qops = []
754-
for d in dirs:
755-
path = os.path.join(self.mntpnt, d)
756-
op = self.fs.rank_tell("quiesce", "path", path, rank=0)['op']
757-
reqid = self._reqid_tostr(op['reqid'])
758-
log.info(f"created {reqid}")
759-
qops.append(reqid)
821+
path = "/pin0/thefile"
822+
823+
# take the policy lock on the file to cause remote authpin from the replica
824+
# when we get to quiescing the path
825+
op = self.fs.rank_tell("lock", "path", path, "policy:x", "--await", rank=0)
826+
policy_reqid = self._reqid_tostr(op['reqid'])
760827

761-
def find_quiesce(blocked_on_remote_auth_pin):
762-
# verify no quiesce ops
763-
ops = self.fs.get_ops(locks=False, rank=0, path="/tmp/mds.0-ops", status=status)['ops']
828+
# We need to simulate a freezing inode to have quiesce block on the authpin.
829+
# This can be done with authpin freeze feature, but it only works when sent from the replica.
830+
# We'll rdlock the policy lock, but it doesn't really matter as the quiesce won't get that far
831+
op = self.fs.rank_tell("lock", "path", path, "policy:r", "--ap-freeze", rank=1)
832+
freeze_reqid = self._reqid_tostr(op['reqid'])
833+
834+
# we should quiesce the same path from the replica side to cause the remote authpin (due to file xlock)
835+
op = self.fs.rank_tell("quiesce", "path", path, rank=1)['op']
836+
quiesce_reqid = self._reqid_tostr(op['reqid'])
837+
838+
def has_quiesce(*, blocked_on_remote_auth_pin):
839+
ops = self.fs.get_ops(locks=False, rank=1, path="/tmp/mds.1-ops")['ops']
840+
log.debug(ops)
764841
for op in ops:
765842
type_data = op['type_data']
766843
flag_point = type_data['flag_point']
767-
op_type = type_data['op_type']
768-
if op_type == 'client_request' or op_type == 'peer_request':
769-
continue
770844
if type_data['op_name'] == "quiesce_inode":
771845
if blocked_on_remote_auth_pin:
772-
if flag_point == "requesting remote authpins":
773-
return True
774-
else:
775-
return True
846+
self.assertEqual("requesting remote authpins", flag_point)
847+
return True
776848
return False
777849

778-
with safe_while(sleep=1, tries=30, action='wait for quiesce op with outstanding remote authpin requests') as proceed:
779-
while proceed():
780-
if find_quiesce(True):
781-
break
850+
# The quiesce should be pending
851+
self.assertTrue(has_quiesce(blocked_on_remote_auth_pin=True))
782852

783-
# okay, now kill all quiesce ops
784-
for reqid in qops:
785-
self.fs.kill_op(reqid, rank=0)
853+
# even after killing the quiesce op, it should still stay pending
854+
self.fs.kill_op(quiesce_reqid, rank=1)
855+
self.assertTrue(has_quiesce(blocked_on_remote_auth_pin=True))
786856

787-
# verify some quiesce_inode ops still exist because authpin acks have not been received
788-
if not find_quiesce(True):
789-
self.fail("did not find quiesce_inode op blocked on remote authpins! (did the lockup on rank 1 complete?)")
857+
# first, kill the policy xlock to release the freezing request
858+
self.fs.kill_op(policy_reqid, rank=0)
859+
time.sleep(1)
790860

791-
# wait for sleep to complete
792-
p.wait()
861+
# this should have let the freezing request to progress
862+
op = self.fs.rank_tell("op", "get", freeze_reqid, rank=1)
863+
log.debug(op)
864+
self.assertEqual(0, op['type_data']['result'])
865+
866+
# now unfreeze the inode
867+
self.fs.kill_op(freeze_reqid, rank=1)
868+
time.sleep(1)
793869

794-
with safe_while(sleep=1, tries=30, action='wait for quiesce kill') as proceed:
795-
while proceed():
796-
if not find_quiesce(False):
797-
break
870+
# the quiesce op should be gone
871+
self.assertFalse(has_quiesce(blocked_on_remote_auth_pin=False))
798872

799873
def test_quiesce_block_file_replicated(self):
800874
"""
@@ -934,8 +1008,8 @@ def test_quiesce_path_multirank_exports(self):
9341008

9351009
sleep(2)
9361010

937-
op0 = self.fs.rank_tell(["quiesce", "path", self.subvolume], rank=0)['op']
938-
op1 = self.fs.rank_tell(["quiesce", "path", self.subvolume], rank=1)['op']
1011+
op0 = self.fs.rank_tell(["quiesce", "path", self.subvolume], rank=0, check_status=False)['op']
1012+
op1 = self.fs.rank_tell(["quiesce", "path", self.subvolume], rank=1, check_status=False)['op']
9391013
reqid0 = self._reqid_tostr(op0['reqid'])
9401014
reqid1 = self._reqid_tostr(op1['reqid'])
9411015
op0 = self._wait_for_quiesce_complete(reqid0, rank=0, timeout=300)

src/mds/Locker.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,6 @@ struct MarkEventOnDestruct {
230230
bool Locker::acquire_locks(const MDRequestRef& mdr,
231231
MutationImpl::LockOpVec& lov,
232232
CInode* auth_pin_freeze,
233-
std::set<MDSCacheObject*> mustpin,
234233
bool auth_pin_nonblocking,
235234
bool skip_quiesce)
236235
{
@@ -245,6 +244,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
245244

246245
client_t client = mdr->get_client();
247246

247+
std::set<MDSCacheObject*> mustpin;
248248
if (auth_pin_freeze)
249249
mustpin.insert(auth_pin_freeze);
250250

@@ -294,7 +294,10 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
294294

295295
dout(20) << " must xlock " << *lock << " " << *object << dendl;
296296

297-
mustpin.insert(object);
297+
// only take the authpin for the quiesce lock on the auth
298+
if (lock->get_type() != CEPH_LOCK_IQUIESCE || object->is_auth()) {
299+
mustpin.insert(object);
300+
}
298301

299302
// augment xlock with a versionlock?
300303
if (lock->get_type() == CEPH_LOCK_DN) {

src/mds/Locker.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ class Locker {
5454
bool acquire_locks(const MDRequestRef& mdr,
5555
MutationImpl::LockOpVec& lov,
5656
CInode *auth_pin_freeze=NULL,
57-
std::set<MDSCacheObject*> mustpin = {},
5857
bool auth_pin_nonblocking=false,
5958
bool skip_quiesce=false);
6059

0 commit comments

Comments
 (0)