Skip to content

Commit 9d3f590

Browse files
committed
Merge PR ceph#57332 into main
* refs/pull/57332/head: mds/quiesce: drop remote authpins before waiting for the quiesce lock qa/cephfs/test_quiesce: test proper handling of remote authpins mds: don't clear `AUTHPIN_FROZEN` until `FROZEN` in rename_prep mds: enhance the `lock path` asok command mds/quiesce: overdrive fragmenting that's still freezing revert: mds: provide a mechanism to authpin while freezing qa/cephfs/test_quiesce: enhance the fragmentation test Reviewed-by: Patrick Donnelly <[email protected]>
2 parents d63855c + 5692f7f commit 9d3f590

22 files changed

+413
-204
lines changed

qa/tasks/cephfs/test_quiesce.py

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,14 @@ def _verify_quiesce_wrapped(self, rank, status, root, root_inode, ops, cache, sp
331331
# check request/cap count is stopped
332332
# count inodes under /usr and count subops!
333333

334+
def quiesce_and_verify(self, path, timeout=120):
335+
J = self.fs.rank_tell("quiesce", "path", path)
336+
log.debug(f"{J}")
337+
reqid = self._reqid_tostr(J['op']['reqid'])
338+
self._wait_for_quiesce_complete(reqid, timeout=timeout)
339+
self._verify_quiesce(root=path)
340+
return reqid
341+
334342
class TestQuiesce(QuiesceTestCase):
335343
"""
336344
Single rank functional tests.
@@ -598,21 +606,39 @@ def test_quiesce_dir_fragment(self):
598606
That quiesce completes with fragmentation in the background.
599607
"""
600608

601-
self.config_set('mds', 'mds_bal_split_size', '10')
602-
self.config_set('mds', 'mds_bal_merge_size', '1') # do not merge
603-
self.config_set('mds', 'mds_bal_split_bits', '1')
604-
self._configure_subvolume()
605-
self._client_background_workload()
606-
607-
# time for the workload to get busy
608-
time.sleep(5)
609+
# the config should cause continuous merge-split wars
610+
self.config_set('mds', 'mds_bal_split_size', '1') # split anything larger than one item ....
611+
self.config_set('mds', 'mds_bal_merge_size', '2') # and then merge if only one item ]:-}
612+
self.config_set('mds', 'mds_bal_split_bits', '2')
609613

610-
J = self.fs.rank_tell("quiesce", "path", self.subvolume)
611-
log.debug(f"{J}")
612-
reqid = self._reqid_tostr(J['op']['reqid'])
613-
self._wait_for_quiesce_complete(reqid)
614-
self._verify_quiesce(root=self.subvolume)
614+
self._configure_subvolume()
615615

616+
self.mount_a.run_shell_payload("mkdir -p root/sub1")
617+
self.mount_a.write_file("root/sub1/file1", "I'm file 1")
618+
self.mount_a.run_shell_payload("mkdir -p root/sub2")
619+
self.mount_a.write_file("root/sub2/file2", "I'm file 2")
620+
621+
sleep_for = 30
622+
log.info(f"Sleeping {sleep_for} seconds to warm up the balancer")
623+
time.sleep(sleep_for)
624+
625+
for _ in range(30):
626+
sub1 = f"{self.subvolume}/root/sub1"
627+
log.debug(f"Quiescing {sub1}")
628+
# with one of the subdirs quiesced, the freezing
629+
# of the parent dir (root) can't complete
630+
op1 = self.quiesce_and_verify(sub1, timeout=15)
631+
632+
sub2 = f"{self.subvolume}/root/sub2"
633+
log.debug(f"{sub1} quiesced: {op1}. Quiescing {sub2}")
634+
# despite the parent dir freezing, we should be able
635+
# to quiesce the other subvolume
636+
op2 = self.quiesce_and_verify(sub2, timeout=15)
637+
638+
log.debug(f"{sub2} quiesced: {op2}. Killing the ops.")
639+
self.fs.kill_op(op1)
640+
self.fs.kill_op(op2)
641+
time.sleep(5)
616642

617643
class TestQuiesceMultiRank(QuiesceTestCase):
618644
"""
@@ -661,6 +687,50 @@ def test_quiesce_path_splitauth(self):
661687
op = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--wait'], rank=0)['op']
662688
self.assertEqual(op['result'], -1) # EPERM
663689

690+
@unittest.skip("https://tracker.ceph.com/issues/66152")
691+
def test_quiesce_drops_remote_authpins_on_failure(self):
692+
"""
693+
That remote authpins are dropped when the request fails to acquire the quiesce lock
694+
695+
When the remote authpin is freezing, not dropping it is likely to deadlock a distributed quiesce
696+
"""
697+
self._configure_subvolume()
698+
699+
# create two dirs for pinning
700+
self.mount_a.run_shell_payload("mkdir -p pin0 pin1")
701+
# enable export by populating the directories
702+
self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy")
703+
# pin the files to different ranks
704+
self.mount_a.setfattr("pin0", "ceph.dir.pin", "0")
705+
self.mount_a.setfattr("pin1", "ceph.dir.pin", "1")
706+
707+
# prepare the patient at rank 0
708+
self.mount_a.write_file("pin0/thefile", "I'm ready, doc")
709+
710+
# wait for the export to settle
711+
self._wait_subtrees([(f"{self.mntpnt}/pin0", 0), (f"{self.mntpnt}/pin1", 1)])
712+
713+
# Take the quiesce lock on the replica of the src file.
714+
# This is needed to cause the next command to block
715+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 lock path {self.mntpnt}/pin0/thefile quiesce:x --await")
716+
self.assertEqual(cmd.exitstatus, 0)
717+
718+
# Simulate a rename by remote-auth-pin-freezing the file.
719+
# Also try to take the quiesce lock to cause the MDR
720+
# to block on quiesce with the remote authpin granted
721+
# Don't --await this time because we expect this to block
722+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 lock path {self.mntpnt}/pin0/thefile quiesce:w --ap-freeze")
723+
self.assertEqual(cmd.exitstatus, 0)
724+
725+
# At this point, if everything works well, we should be able to
726+
# autpin and quiesce the file on the auth side.
727+
# If the op above fails to release remote authpins, then the inode
728+
# will still be authpin frozen, and that will disallow auth pinning the file
729+
# We are using a combination of --ap-dont-block and --await
730+
# to detect whether the file is authpinnable
731+
cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:0 lock path {self.mntpnt}/pin0/thefile quiesce:x --ap-dont-block --await")
732+
self.assertEqual(cmd.exitstatus, 0)
733+
664734
def test_quiesce_authpin_wait(self):
665735
"""
666736
That a quiesce_inode op with outstanding remote authpin requests can be killed.
@@ -804,7 +874,7 @@ def test_quiesce_block_replicated(self):
804874
op = self.fs.rank_tell("lock", "path", self.mntpnt+"/dir1/dir2", "policy:r", rank=1)
805875
p = self.mount_a.setfattr("dir1/dir2", "ceph.quiesce.block", "1", wait=False)
806876
sleep(2) # for req to block waiting for xlock on policylock
807-
reqid = self._reqid_tostr(op['op']['reqid'])
877+
reqid = self._reqid_tostr(op['reqid'])
808878
self.fs.kill_op(reqid, rank=1)
809879
p.wait()
810880

qa/tasks/vstart_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1375,7 +1375,7 @@ def exec_test():
13751375
elif f == '--run-all-tests':
13761376
opt_exit_on_test_failure = False
13771377
elif f == '--debug':
1378-
log.setLevel(logging.DEBUG)
1378+
logging.root.setLevel(logging.DEBUG)
13791379
elif f == '--config-mode':
13801380
mode = Mode.config
13811381
else:

src/common/TrackedOp.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -204,26 +204,27 @@ class OpTracker {
204204
}
205205
~OpTracker();
206206

207-
template <typename T, typename U>
208-
typename T::Ref create_request(U params)
207+
// NB: P is ref-like, i.e. `params` should be dereferenced for members
208+
template <typename R, typename P>
209+
typename R::Ref create_request(P params)
209210
{
210-
constexpr bool has_is_continuous = requires(U u) {
211-
{ u->is_continuous() } -> std::same_as<bool>;
211+
constexpr bool enable_mark_continuous = requires(typename R::Ref r, P p) {
212+
{ p->is_continuous() } -> std::same_as<bool>;
213+
r->mark_continuous();
212214
};
213-
typename T::Ref retval(new T(params, this));
215+
typename R::Ref retval(new R(params, this));
214216
retval->tracking_start();
215217
if (is_tracking()) {
216218
retval->mark_event("header_read", params->get_recv_stamp());
217219
retval->mark_event("throttled", params->get_throttle_stamp());
218220
retval->mark_event("all_read", params->get_recv_complete_stamp());
219221
retval->mark_event("dispatched", params->get_dispatch_stamp());
220222
}
221-
if constexpr (has_is_continuous) {
223+
if constexpr (enable_mark_continuous) {
222224
if (params->is_continuous()) {
223225
retval->mark_continuous();
224226
}
225227
}
226-
227228
return retval;
228229
}
229230
};

src/mds/CDentry.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,10 +368,10 @@ int CDentry::get_num_dir_auth_pins() const
368368
return auth_pins;
369369
}
370370

371-
bool CDentry::can_auth_pin(int *err_ret, bool bypassfreezing) const
371+
bool CDentry::can_auth_pin(int *err_ret) const
372372
{
373373
ceph_assert(dir);
374-
return dir->can_auth_pin(err_ret, bypassfreezing);
374+
return dir->can_auth_pin(err_ret);
375375
}
376376

377377
void CDentry::auth_pin(void *by)

src/mds/CDentry.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry>
222222
void _put() override;
223223

224224
// auth pins
225-
bool can_auth_pin(int *err_ret=nullptr, bool bypassfreezing=false) const override;
225+
bool can_auth_pin(int *err_ret=nullptr) const override;
226226
void auth_pin(void *by) override;
227227
void auth_unpin(void *by) override;
228228
void adjust_nested_auth_pins(int diradj, void *by);

src/mds/CDir.cc

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3450,23 +3450,16 @@ void CDir::adjust_freeze_after_rename(CDir *dir)
34503450
mdcache->mds->queue_waiters(unfreeze_waiters);
34513451
}
34523452

3453-
bool CDir::can_auth_pin(int *err_ret, bool bypassfreezing) const
3453+
bool CDir::can_auth_pin(int *err_ret) const
34543454
{
34553455
int err;
34563456
if (!is_auth()) {
34573457
err = ERR_NOT_AUTH;
3458-
} else if (is_freezing_dir()) {
3459-
if (bypassfreezing) {
3460-
dout(20) << "allowing authpin with freezing" << dendl;
3461-
err = 0;
3462-
} else {
3463-
err = ERR_FRAGMENTING_DIR;
3464-
}
3465-
} else if (is_frozen_dir()) {
3458+
} else if (is_freezing_dir() || is_frozen_dir()) {
34663459
err = ERR_FRAGMENTING_DIR;
34673460
} else {
34683461
auto p = is_freezing_or_frozen_tree();
3469-
if (p.first && !bypassfreezing) {
3462+
if (p.first) {
34703463
err = ERR_EXPORTING_TREE;
34713464
} else if (p.second) {
34723465
err = ERR_EXPORTING_TREE;

src/mds/CDir.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ class CDir : public MDSCacheObject, public Counter<CDir> {
527527
void abort_import();
528528

529529
// -- auth pins --
530-
bool can_auth_pin(int *err_ret=nullptr, bool bypassfreezing=false) const override;
530+
bool can_auth_pin(int *err_ret=nullptr) const override;
531531
int get_auth_pins() const { return auth_pins; }
532532
int get_dir_auth_pins() const { return dir_auth_pins; }
533533
void auth_pin(void *who) override;

src/mds/CInode.cc

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2970,22 +2970,15 @@ void CInode::clear_ambiguous_auth()
29702970
}
29712971

29722972
// auth_pins
2973-
bool CInode::can_auth_pin(int *err_ret, bool bypassfreezing) const {
2973+
bool CInode::can_auth_pin(int *err_ret) const {
29742974
int err;
29752975
if (!is_auth()) {
29762976
err = ERR_NOT_AUTH;
2977-
} else if (is_freezing_inode()) {
2978-
if (bypassfreezing) {
2979-
dout(20) << "allowing authpin with freezing" << dendl;
2980-
err = 0;
2981-
} else {
2982-
err = ERR_EXPORTING_INODE;
2983-
}
2984-
} else if (is_frozen_inode() || is_frozen_auth_pin()) {
2977+
} else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
29852978
err = ERR_EXPORTING_INODE;
29862979
} else {
29872980
if (parent)
2988-
return parent->can_auth_pin(err_ret, bypassfreezing);
2981+
return parent->can_auth_pin(err_ret);
29892982
err = 0;
29902983
}
29912984
if (err && err_ret)
@@ -5605,4 +5598,23 @@ void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
56055598
}
56065599
}
56075600

5601+
bool CInode::is_quiesced() const {
5602+
if (!quiescelock.is_xlocked()) {
5603+
return false;
5604+
}
5605+
// check that it's the quiesce op that's holding the lock
5606+
auto mut = quiescelock.get_xlock_by();
5607+
ceph_assert(mut); /* that would be weird */
5608+
auto* mdr = dynamic_cast<MDRequestImpl*>(mut.get());
5609+
ceph_assert(mdr); /* also would be weird */
5610+
return mdr->internal_op == CEPH_MDS_OP_QUIESCE_INODE;
5611+
}
5612+
5613+
bool CInode::will_block_for_quiesce(const MDRequestRef& mdr) {
5614+
if (mdr && mdr->is_wrlocked(&quiescelock)) {
5615+
return false;
5616+
}
5617+
return !quiescelock.can_wrlock();
5618+
}
5619+
56085620
MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);

src/mds/CInode.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
662662
bool is_file() const { return get_inode()->is_file(); }
663663
bool is_symlink() const { return get_inode()->is_symlink(); }
664664
bool is_dir() const { return get_inode()->is_dir(); }
665-
bool is_quiesced() const { return quiescelock.is_xlocked(); }
665+
bool is_quiesced() const;
666+
bool will_block_for_quiesce(const MDRequestRef& mdr = MDRequestRef {});
666667

667668
bool is_head() const { return last == CEPH_NOSNAP; }
668669

@@ -930,7 +931,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
930931
mds_authority_t authority() const override;
931932

932933
// -- auth pins --
933-
bool can_auth_pin(int *err_ret=nullptr, bool bypassfreezing=false) const override;
934+
bool can_auth_pin(int *err_ret=nullptr) const override;
934935
void auth_pin(void *by) override;
935936
void auth_unpin(void *by) override;
936937

0 commit comments

Comments
 (0)