Commit 0c4d835

Merge PR ceph#49773 into main

* refs/pull/49773/head:
  mds: add config to decide whether to mark dentry bad
  qa: add missing scan_links step for data scan recovery
  qa/tasks/cephfs: test damage to dentry's first is caught
  qa/tasks/cephfs: use rank_asok and allow specifying rank
  qa/tasks: allow specifying timeout command prefix to ceph
  mds: provide test configs for creating first corruption
  mds: catch damage to dentry's first field
  mds: add debugging for pre_cow_old_inode
  mds: cleanup code

Reviewed-by: Kotresh Hiremath Ravishankar <[email protected]>
2 parents: c3f1eee + 7ffa065

19 files changed (+257, -30)

qa/suites/fs/functional/tasks/damage.yaml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ overrides:
       - Metadata damage detected
       - MDS_READ_ONLY
       - force file system read-only
+      - with standby daemon mds
 tasks:
 - cephfs_test_runner:
     modules:

qa/tasks/ceph_manager.py

Lines changed: 15 additions & 6 deletions
@@ -1564,19 +1564,28 @@ def run_cluster_cmd(self, **kwargs):
         elif isinstance(kwargs['args'], tuple):
             kwargs['args'] = list(kwargs['args'])
 
+        prefixcmd = []
+        timeoutcmd = kwargs.pop('timeoutcmd', None)
+        if timeoutcmd is not None:
+            prefixcmd += ['timeout', str(timeoutcmd)]
+
         if self.cephadm:
+            prefixcmd += ['ceph']
+            cmd = prefixcmd + list(kwargs['args'])
             return shell(self.ctx, self.cluster, self.controller,
-                         args=['ceph'] + list(kwargs['args']),
+                         args=cmd,
                          stdout=StringIO(),
                          check_status=kwargs.get('check_status', True))
-        if self.rook:
+        elif self.rook:
+            prefixcmd += ['ceph']
+            cmd = prefixcmd + list(kwargs['args'])
             return toolbox(self.ctx, self.cluster,
-                           args=['ceph'] + list(kwargs['args']),
+                           args=cmd,
                            stdout=StringIO(),
                            check_status=kwargs.get('check_status', True))
-
-        kwargs['args'] = self.CEPH_CMD + kwargs['args']
-        return self.controller.run(**kwargs)
+        else:
+            kwargs['args'] = prefixcmd + self.CEPH_CMD + kwargs['args']
+            return self.controller.run(**kwargs)
 
     def raw_cluster_cmd(self, *args, **kwargs) -> str:
         """

qa/tasks/cephfs/filesystem.py

Lines changed: 6 additions & 3 deletions
@@ -1249,12 +1249,12 @@ def ranks_perf(self, f, status=None):
             out.append((rank, f(perf)))
         return out
 
-    def read_cache(self, path, depth=None):
+    def read_cache(self, path, depth=None, rank=None):
         cmd = ["dump", "tree", path]
         if depth is not None:
             cmd.append(depth.__str__())
-        result = self.mds_asok(cmd)
-        if len(result) == 0:
+        result = self.rank_asok(cmd, rank=rank)
+        if result is None or len(result) == 0:
             raise RuntimeError("Path not found in cache: {0}".format(path))
 
         return result
@@ -1648,6 +1648,9 @@ def run_scrub(self, cmd, rank=0):
     def get_scrub_status(self, rank=0):
         return self.run_scrub(["status"], rank)
 
+    def flush(self, rank=0):
+        return self.rank_tell(["flush", "journal"], rank=rank)
+
     def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
                                   timeout=300, reverse=False):
         # time out after "timeout" seconds and assume as done
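
Both helpers feed the new damage tests: read_cache() can now target a specific MDS rank via rank_asok(), and flush() wraps the recurring "flush journal" tell. A short sketch (the path is illustrative):

    # dump the cached subtree rooted at /a from rank 0, depth 0
    info = fs.read_cache("/a", 0, rank=0)

    # flush rank 0's journal, e.g. before failing the filesystem
    fs.flush(rank=0)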

qa/tasks/cephfs/test_damage.py

Lines changed: 97 additions & 0 deletions
@@ -3,6 +3,7 @@
 import logging
 import errno
 import re
+import time
 from teuthology.contextutil import MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from teuthology.orchestra.run import wait
@@ -562,3 +563,99 @@ def test_open_ino_errors(self):
             self.fs.mon_manager.raw_cluster_cmd(
                 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                 "damage", "rm", str(entry['id']))
+
+    def test_dentry_first_existing(self):
+        """
+        That the MDS won't abort when the dentry is already known to be damaged.
+        """
+
+        def verify_corrupt():
+            info = self.fs.read_cache("/a", 0)
+            log.debug('%s', info)
+            self.assertEqual(len(info), 1)
+            dirfrags = info[0]['dirfrags']
+            self.assertEqual(len(dirfrags), 1)
+            dentries = dirfrags[0]['dentries']
+            self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
+            self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
+
+        self.mount_a.run_shell_payload("mkdir -p a/b")
+        self.fs.flush()
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        self.mount_a.run_shell_payload("mv a/b a/c; sync .")
+        self.mount_a.umount()
+        verify_corrupt()
+        self.fs.fail()
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", True)
+        self.fs.set_joinable()
+        status = self.fs.status()
+        self.fs.flush()
+        self.assertFalse(self.fs.status().hadfailover(status))
+        verify_corrupt()
+
+    def test_dentry_first_preflush(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the journal.
+        """
+
+        rank0 = self.fs.get_rank()
+        self.fs.rank_freeze(True, rank=0)
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
+        self.fs.flush()
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        p.wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
+
+    def test_dentry_first_precommit(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the directory object.
+        """
+
+        fscid = self.fs.id
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
+        self.mount_a.umount() # allow immediate scatter write back
+        self.fs.flush()
+        # now just twiddle some inode metadata on a regular file
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
+        self.mount_a.umount() # avoid journaling session related things
+        # okay, now cause the dentry to get damaged after loading from the journal
+        self.fs.fail()
+        self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        rank0 = self.fs.get_rank()
+        self.fs.rank_freeze(True, rank=0)
+        # so now we want to trigger commit but this will crash, so:
+        c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
+        p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            print(e)
+        else:
+            self.fail("flush journal should fail!")
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
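
All three tests share the same idiom for catching the deliberate MDS abort: freeze the rank so the monitors do not replace it, wait for its beacon to go laggy, then consume the expected coredump and restart. Roughly (names as in the tests above):

    self.fs.rank_freeze(True, rank=0)                  # keep the mons from failing over rank 0
    # ... trigger the operation that is expected to abort the MDS ...
    self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(),
                         timeout=self.fs.beacon_timeout)  # the abort surfaces as a missed beacon
    self.fs.rank_freeze(False, rank=0)
    self.delete_mds_coredump(rank0['name'])            # the abort is intentional; discard the core
    self.fs.mds_restart(rank0['name'])
    self.fs.wait_for_daemons()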

qa/tasks/cephfs/test_data_scan.py

Lines changed: 1 addition & 0 deletions
@@ -402,6 +402,7 @@ def get_state(mds_id):
         self.fs.data_scan(["init"])
         self.fs.data_scan(["scan_extents"], worker_count=workers)
         self.fs.data_scan(["scan_inodes"], worker_count=workers)
+        self.fs.data_scan(["scan_links"])
 
         # Mark the MDS repaired
         self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
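
scan_links is the pass that repairs linkage (and brings the inotable forward) after scan_inodes; skipping it can leave the inotable at a version that replay rejects, which is also why test_forward_scrub.py below resets the journal before scanning. Under the assumption the filesystem has already been failed, the offline recovery sequence these tests exercise looks like:

    fs.journal_tool(["journal", "reset", "--force"], 0)  # start replay from a clean journal
    fs.data_scan(["init"])
    fs.data_scan(["scan_extents"], worker_count=workers)
    fs.data_scan(["scan_inodes"], worker_count=workers)
    fs.data_scan(["scan_links"])                         # the step this commit adds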

qa/tasks/cephfs/test_forward_scrub.py

Lines changed: 10 additions & 3 deletions
@@ -129,7 +129,7 @@ def test_orphan_scan(self):
         # Umount before flush to avoid cap releases putting
         # things we don't want in the journal later.
         self.mount_a.umount_wait()
-        self.fs.mds_asok(["flush", "journal"])
+        self.fs.flush()
 
         # Create a new inode that's just in the log, i.e. would
         # look orphaned to backward scan if backward scan wisnae
@@ -163,7 +163,7 @@ def test_orphan_scan(self):
 
         # Run a tagging forward scrub
         tag = "mytag123"
-        self.fs.mds_asok(["tag", "path", "/parent", tag])
+        self.fs.rank_asok(["tag", "path", "/parent", tag])
 
         # See that the orphan wisnae tagged
         self.assertUntagged(inos['./parent/flushed/bravo'])
@@ -175,14 +175,21 @@ def test_orphan_scan(self):
         # See that journalled-but-not-flushed file *was* tagged
         self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
 
-        # Run cephfs-data-scan targeting only orphans
+        # okay, now we are going to run cephfs-data-scan. It's necessary to
+        # have a clean journal, otherwise replay will blow up on mismatched
+        # inotable versions (due to scan_links)
+        self.fs.flush()
         self.fs.fail()
+        self.fs.journal_tool(["journal", "reset", "--force"], 0)
+
+        # Run cephfs-data-scan targeting only orphans
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
         self.fs.data_scan([
             "scan_inodes",
             "--filter-tag", tag,
             self.fs.get_data_pool_name()
         ])
+        self.fs.data_scan(["scan_links"])
 
         # After in-place injection stats should be kosher again
         self.fs.set_ceph_conf('mds', 'mds verify scatter', True)

src/common/options/mds.yaml.in

Lines changed: 34 additions & 0 deletions
@@ -952,6 +952,40 @@ options:
   - mds
   fmt_desc: The debug subtree invariants (for developers only).
   with_legacy: true
+- name: mds_abort_on_newly_corrupt_dentry
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mds
+  fmt_desc: MDS will abort if dentry is detected newly corrupted.
+- name: mds_go_bad_corrupt_dentry
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mds
+  fmt_desc: MDS will mark a corrupt dentry as bad and isolate it.
+  flags:
+  - runtime
+- name: mds_inject_rename_corrupt_dentry_first
+  type: float
+  level: dev
+  default: 0.0
+  services:
+  - mds
+  fmt_desc: probabilistically inject corrupt CDentry::first at rename
+  flags:
+  - runtime
+- name: mds_inject_journal_corrupt_dentry_first
+  type: float
+  level: dev
+  default: 0.0
+  services:
+  - mds
+  fmt_desc: probabilistically inject corrupt CDentry::first at journal load
+  flags:
+  - runtime
 - name: mds_kill_mdstable_at
   type: int
   level: dev
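
The go-bad and injection knobs carry the runtime flag, so the tests flip them on a live MDS through the QA config helpers and give the change a moment to reach the daemons, as in test_damage.py above:

    # inject CDentry::first corruption on every rename (probability 1.0)
    self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
    time.sleep(5)  # for conf to percolate to the MDS
    # ... perform the rename that trips the injection ...
    self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")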

src/mds/CDentry.cc

Lines changed: 24 additions & 0 deletions
@@ -17,6 +17,7 @@
 #include "CDentry.h"
 #include "CInode.h"
 #include "CDir.h"
+#include "SnapClient.h"
 
 #include "MDSRank.h"
 #include "MDCache.h"
@@ -697,4 +698,27 @@ bool CDentry::scrub(snapid_t next_seq)
   return false;
 }
 
+bool CDentry::check_corruption(bool load)
+{
+  auto&& snapclient = dir->mdcache->mds->snapclient;
+  auto next_snap = snapclient->get_last_seq()+1;
+  if (first > last || (snapclient->is_server_ready() && first > next_snap)) {
+    if (load) {
+      dout(1) << "loaded already corrupt dentry: " << *this << dendl;
+      corrupt_first_loaded = true;
+    } else {
+      derr << "newly corrupt dentry to be committed: " << *this << dendl;
+    }
+    if (g_conf().get_val<bool>("mds_go_bad_corrupt_dentry")) {
+      dir->go_bad_dentry(last, get_name());
+    }
+    if (!load && g_conf().get_val<bool>("mds_abort_on_newly_corrupt_dentry")) {
+      dir->mdcache->mds->clog->error() << "MDS abort because newly corrupt dentry to be committed: " << *this;
+      ceph_abort("detected newly corrupt dentry"); /* avoid writing out newly corrupted dn */
+    }
+    return true;
+  }
+  return false;
+}
+
 MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co);
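
The invariant check_corruption() enforces: a dentry's snapid range [first, last] must not be inverted, and first must not run ahead of the snap server's next sequence number. Restated in Python for clarity (a direct transcription of the C++ condition above, not part of the patch):

    def is_corrupt_dentry(first, last, next_snap, server_ready):
        # corrupt when the range is inverted, or when `first` claims a snapid
        # the snap server has not handed out yet
        return first > last or (server_ready and first > next_snap)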

src/mds/CDentry.h

Lines changed: 3 additions & 0 deletions
@@ -160,6 +160,8 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry>
     return dentry_key_t(last, name.c_str(), hash);
   }
 
+  bool check_corruption(bool load);
+
   const CDir *get_dir() const { return dir; }
   CDir *get_dir() { return dir; }
   std::string_view get_name() const { return std::string_view(name); }
@@ -367,6 +369,7 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry>
 
   __u32 hash;
   snapid_t first, last;
+  bool corrupt_first_loaded = false; /* for Postgres corruption detection */
 
   elist<CDentry*>::item item_dirty, item_dir_dirty;
   elist<CDentry*>::item item_stray;
