Skip to content

Commit 293b90f

Browse files
committed
qa/tasks/cephfs: test damage to dentry's first is caught
Signed-off-by: Patrick Donnelly <[email protected]>
1 parent 3ba1739 commit 293b90f

File tree

3 files changed

+101
-0
lines changed

3 files changed

+101
-0
lines changed

qa/suites/fs/functional/tasks/damage.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ overrides:
1818
- Metadata damage detected
1919
- MDS_READ_ONLY
2020
- force file system read-only
21+
- with standby daemon mds
2122
tasks:
2223
- cephfs_test_runner:
2324
modules:

qa/tasks/cephfs/filesystem.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1648,6 +1648,9 @@ def run_scrub(self, cmd, rank=0):
16481648
# Thin wrapper: returns whatever run_scrub() yields for the ["status"]
# command on the given rank (default rank 0).
def get_scrub_status(self, rank=0):
16491649
return self.run_scrub(["status"], rank)
16501650

1651+
# Send the two-word command ["flush", "journal"] to the given rank (default
# rank 0) through self.rank_tell() and return its result unchanged.
# NOTE(review): rank_tell() is defined outside this hunk; presumably it
# raises on command failure -- confirm before relying on that.
def flush(self, rank=0):
1652+
return self.rank_tell(["flush", "journal"], rank=rank)
1653+
16511654
def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
16521655
timeout=300, reverse=False):
16531656
# time out after "timeout" seconds and assume as done

qa/tasks/cephfs/test_damage.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import errno
55
import re
6+
import time
67
from teuthology.contextutil import MaxWhileTries
78
from teuthology.exceptions import CommandFailedError
89
from teuthology.orchestra.run import wait
@@ -562,3 +563,99 @@ def test_open_ino_errors(self):
562563
self.fs.mon_manager.raw_cluster_cmd(
563564
'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
564565
"damage", "rm", str(entry['id']))
566+
567+
def test_dentry_first_existing(self):
568+
"""
569+
That the MDS won't abort when the dentry is already known to be damaged.
570+
"""
571+
572+
# Helper: read the cache entry for "/a" on rank 0 and assert that it has
# exactly one entry with one dirfrag, that the only primary dentry is
# 'a/c', and that the first listed dentry carries the expected snap_first.
def verify_corrupt():
573+
info = self.fs.read_cache("/a", 0)
574+
log.debug('%s', info)
575+
self.assertEqual(len(info), 1)
576+
dirfrags = info[0]['dirfrags']
577+
self.assertEqual(len(dirfrags), 1)
578+
dentries = dirfrags[0]['dentries']
579+
self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
580+
# NOTE(review): the literal below equals 2**64 - 10; confirm that this is
# the value the trailing SNAP_HEAD label is meant to describe.
self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
581+
582+
# Create a/b and flush the journal before any injection is enabled.
self.mount_a.run_shell_payload("mkdir -p a/b")
583+
self.fs.flush()
584+
# Disable abort-on-new-corruption and turn on the rename injection (1.0).
self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
585+
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
586+
time.sleep(5) # for conf to percolate
587+
# The rename a/b -> a/c runs while the injection option is set.
self.mount_a.run_shell_payload("mv a/b a/c; sync .")
588+
self.mount_a.umount()
589+
verify_corrupt()
590+
# Fail the file system, remove the injection option, then make it joinable
# again.
self.fs.fail()
591+
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
592+
# NOTE(review): this repeats the config_set made before the rename with the
# same value (False) -- confirm whether a different value was intended.
self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
593+
self.fs.set_joinable()
594+
# Snapshot the status, flush the journal, and assert that no failover
# happened relative to that snapshot.
status = self.fs.status()
595+
self.fs.flush()
596+
self.assertFalse(self.fs.status().hadfailover(status))
597+
# The same cache assertions must still hold after the flush.
verify_corrupt()
598+
599+
def test_dentry_first_preflush(self):
600+
"""
601+
That the MDS won't write a dentry with new damage to CDentry::first
602+
to the journal.
603+
"""
604+
605+
# Remember rank 0's info now; its 'name' is used later to clean up the
# coredump and restart that same daemon.
rank0 = self.fs.get_rank()
606+
# NOTE(review): presumably freezing the rank keeps a standby from taking
# over while the MDS is down -- confirm against rank_freeze().
self.fs.rank_freeze(True, rank=0)
607+
self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
608+
self.fs.flush()
609+
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
610+
time.sleep(5) # for conf to percolate
611+
# Start the rename in the background (wait=False), bounded by a 60 second
# shell timeout.
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
612+
# Block until the rank info contains "laggy_since", for at most
# self.fs.beacon_timeout.
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
613+
# Remove the injection and unfreeze before restarting the daemon.
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
614+
self.fs.rank_freeze(False, rank=0)
615+
self.delete_mds_coredump(rank0['name'])
616+
self.fs.mds_restart(rank0['name'])
617+
self.fs.wait_for_daemons()
618+
# Collect the background rename started above.
p.wait()
619+
# Final check: the tree under a/ can be stat'ed and walked, and a journal
# flush goes through.
self.mount_a.run_shell_payload("stat a/ && find a/")
620+
self.fs.flush()
621+
622+
def test_dentry_first_precommit(self):
623+
"""
624+
That the MDS won't write a dentry with new damage to CDentry::first
625+
to the directory object.
626+
"""
627+
628+
# The file system id is used below to address rank 0 as mds.<fscid>:0.
fscid = self.fs.id
629+
self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
630+
self.mount_a.umount() # allow immediate scatter write back
631+
self.fs.flush()
632+
# now just twiddle some inode metadata: the mode of a/b/d, which the
# mkdir -p above creates (so it is a directory, not a regular file)
633+
self.mount_a.mount_wait()
634+
self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
635+
self.mount_a.umount() # avoid journaling session related things
636+
# okay, now cause the dentry to get damaged after loading from the journal
637+
self.fs.fail()
638+
self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
639+
time.sleep(5) # for conf to percolate
640+
self.fs.set_joinable()
641+
self.fs.wait_for_daemons()
642+
# Capture rank 0's info after the daemons are back; its 'name' is used
# below for coredump cleanup and the restart.
rank0 = self.fs.get_rank()
643+
self.fs.rank_freeze(True, rank=0)
644+
# so now we want to trigger commit but this will crash, so:
645+
# Issue "flush journal" in the background (wait=False) with a 60 second
# connect timeout and timeoutcmd=30, so the test is not blocked on it.
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
646+
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
647+
# Block until the rank info contains "laggy_since", for at most
# self.fs.beacon_timeout.
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
648+
# Remove the injection and unfreeze before restarting the daemon.
self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
649+
self.fs.rank_freeze(False, rank=0)
650+
self.delete_mds_coredump(rank0['name'])
651+
self.fs.mds_restart(rank0['name'])
652+
self.fs.wait_for_daemons()
653+
# The background flush is required to fail: CommandFailedError is the
# expected outcome, and a clean return fails the test.
try:
654+
p.wait()
655+
except CommandFailedError as e:
656+
print(e)
657+
else:
658+
self.fail("flush journal should fail!")
659+
# Final check: remount, walk the tree under a/, and confirm that a journal
# flush now goes through.
self.mount_a.mount_wait()
660+
self.mount_a.run_shell_payload("stat a/ && find a/")
661+
self.fs.flush()

0 commit comments

Comments
 (0)