Commit 0c4d835

Merge PR ceph#49773 into main

* refs/pull/49773/head:
  mds: add config to decide whether to mark dentry bad
  qa: add missing scan_links step for data scan recovery
  qa/tasks/cephfs: test damage to dentry's first is caught
  qa/tasks/cephfs: use rank_asok and allow specifying rank
  qa/tasks: allow specifying timeout command prefix to ceph
  mds: provide test configs for creating first corruption
  mds: catch damage to dentry's first field
  mds: add debugging for pre_cow_old_inode
  mds: cleanup code

Reviewed-by: Kotresh Hiremath Ravishankar <[email protected]>
2 parents: c3f1eee + 7ffa065

19 files changed (+257, -30)

qa/suites/fs/functional/tasks/damage.yaml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ overrides:
       - Metadata damage detected
       - MDS_READ_ONLY
       - force file system read-only
+      - with standby daemon mds
 tasks:
 - cephfs_test_runner:
     modules:

qa/tasks/ceph_manager.py

Lines changed: 15 additions & 6 deletions
@@ -1564,19 +1564,28 @@ def run_cluster_cmd(self, **kwargs):
         elif isinstance(kwargs['args'], tuple):
             kwargs['args'] = list(kwargs['args'])
 
+        prefixcmd = []
+        timeoutcmd = kwargs.pop('timeoutcmd', None)
+        if timeoutcmd is not None:
+            prefixcmd += ['timeout', str(timeoutcmd)]
+
         if self.cephadm:
+            prefixcmd += ['ceph']
+            cmd = prefixcmd + list(kwargs['args'])
             return shell(self.ctx, self.cluster, self.controller,
-                         args=['ceph'] + list(kwargs['args']),
+                         args=cmd,
                          stdout=StringIO(),
                          check_status=kwargs.get('check_status', True))
-        if self.rook:
+        elif self.rook:
+            prefixcmd += ['ceph']
+            cmd = prefixcmd + list(kwargs['args'])
             return toolbox(self.ctx, self.cluster,
-                           args=['ceph'] + list(kwargs['args']),
+                           args=cmd,
                            stdout=StringIO(),
                            check_status=kwargs.get('check_status', True))
-
-        kwargs['args'] = self.CEPH_CMD + kwargs['args']
-        return self.controller.run(**kwargs)
+        else:
+            kwargs['args'] = prefixcmd + self.CEPH_CMD + kwargs['args']
+            return self.controller.run(**kwargs)
 
     def raw_cluster_cmd(self, *args, **kwargs) -> str:
         """

qa/tasks/cephfs/filesystem.py

Lines changed: 6 additions & 3 deletions
@@ -1249,12 +1249,12 @@ def ranks_perf(self, f, status=None):
             out.append((rank, f(perf)))
         return out
 
-    def read_cache(self, path, depth=None):
+    def read_cache(self, path, depth=None, rank=None):
         cmd = ["dump", "tree", path]
         if depth is not None:
             cmd.append(depth.__str__())
-        result = self.mds_asok(cmd)
-        if len(result) == 0:
+        result = self.rank_asok(cmd, rank=rank)
+        if result is None or len(result) == 0:
             raise RuntimeError("Path not found in cache: {0}".format(path))
 
         return result
@@ -1648,6 +1648,9 @@ def run_scrub(self, cmd, rank=0):
     def get_scrub_status(self, rank=0):
         return self.run_scrub(["status"], rank)
 
+    def flush(self, rank=0):
+        return self.rank_tell(["flush", "journal"], rank=rank)
+
     def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
                                   timeout=300, reverse=False):
         # time out after "timeout" seconds and assume as done
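
Both helpers feed the new damage tests: read_cache() can now target a specific MDS rank via rank_asok(), and flush() wraps the recurring "flush journal" tell. A short sketch (the path is illustrative):

    # dump the cached subtree rooted at /a from rank 0, depth 0
    info = fs.read_cache("/a", 0, rank=0)

    # flush rank 0's journal, e.g. before failing the filesystem
    fs.flush(rank=0)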

qa/tasks/cephfs/test_damage.py

Lines changed: 97 additions & 0 deletions
@@ -3,6 +3,7 @@
 import logging
 import errno
 import re
+import time
 from teuthology.contextutil import MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from teuthology.orchestra.run import wait
@@ -562,3 +563,99 @@ def test_open_ino_errors(self):
             self.fs.mon_manager.raw_cluster_cmd(
                 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                 "damage", "rm", str(entry['id']))
+
+    def test_dentry_first_existing(self):
+        """
+        That the MDS won't abort when the dentry is already known to be damaged.
+        """
+
+        def verify_corrupt():
+            info = self.fs.read_cache("/a", 0)
+            log.debug('%s', info)
+            self.assertEqual(len(info), 1)
+            dirfrags = info[0]['dirfrags']
+            self.assertEqual(len(dirfrags), 1)
+            dentries = dirfrags[0]['dentries']
+            self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
+            self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
+
+        self.mount_a.run_shell_payload("mkdir -p a/b")
+        self.fs.flush()
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        self.mount_a.run_shell_payload("mv a/b a/c; sync .")
+        self.mount_a.umount()
+        verify_corrupt()
+        self.fs.fail()
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", True)
+        self.fs.set_joinable()
+        status = self.fs.status()
+        self.fs.flush()
+        self.assertFalse(self.fs.status().hadfailover(status))
+        verify_corrupt()
+
+    def test_dentry_first_preflush(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the journal.
+        """
+
+        rank0 = self.fs.get_rank()
+        self.fs.rank_freeze(True, rank=0)
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
+        self.fs.flush()
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        p.wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
+
+    def test_dentry_first_precommit(self):
+        """
+        That the MDS won't write a dentry with new damage to CDentry::first
+        to the directory object.
+        """
+
+        fscid = self.fs.id
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
+        self.mount_a.umount() # allow immediate scatter write back
+        self.fs.flush()
+        # now just twiddle some inode metadata on a regular file
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
+        self.mount_a.umount() # avoid journaling session related things
+        # okay, now cause the dentry to get damaged after loading from the journal
+        self.fs.fail()
+        self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        rank0 = self.fs.get_rank()
+        self.fs.rank_freeze(True, rank=0)
+        # so now we want to trigger commit but this will crash, so:
+        c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
+        p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            print(e)
+        else:
+            self.fail("flush journal should fail!")
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
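
All three tests share the same idiom for catching the deliberate MDS abort: freeze the rank so the monitors do not replace it, wait for its beacon to go laggy, then consume the expected coredump and restart. Roughly (names as in the tests above):

    self.fs.rank_freeze(True, rank=0)                  # keep the mons from failing over rank 0
    # ... trigger the operation that is expected to abort the MDS ...
    self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(),
                         timeout=self.fs.beacon_timeout)  # the abort surfaces as a missed beacon
    self.fs.rank_freeze(False, rank=0)
    self.delete_mds_coredump(rank0['name'])            # the abort is intentional; discard the core
    self.fs.mds_restart(rank0['name'])
    self.fs.wait_for_daemons()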

qa/tasks/cephfs/test_data_scan.py

Lines changed: 1 addition & 0 deletions
@@ -402,6 +402,7 @@ def get_state(mds_id):
         self.fs.data_scan(["init"])
         self.fs.data_scan(["scan_extents"], worker_count=workers)
         self.fs.data_scan(["scan_inodes"], worker_count=workers)
+        self.fs.data_scan(["scan_links"])
 
         # Mark the MDS repaired
         self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
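
scan_links is the pass that repairs linkage (and brings the inotable forward) after scan_inodes; skipping it can leave the inotable at a version that replay rejects, which is also why test_forward_scrub.py below resets the journal before scanning. Under the assumption the filesystem has already been failed, the offline recovery sequence these tests exercise looks like:

    fs.journal_tool(["journal", "reset", "--force"], 0)  # start replay from a clean journal
    fs.data_scan(["init"])
    fs.data_scan(["scan_extents"], worker_count=workers)
    fs.data_scan(["scan_inodes"], worker_count=workers)
    fs.data_scan(["scan_links"])                         # the step this commit adds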

qa/tasks/cephfs/test_forward_scrub.py

Lines changed: 10 additions & 3 deletions
@@ -129,7 +129,7 @@ def test_orphan_scan(self):
         # Umount before flush to avoid cap releases putting
         # things we don't want in the journal later.
         self.mount_a.umount_wait()
-        self.fs.mds_asok(["flush", "journal"])
+        self.fs.flush()
 
         # Create a new inode that's just in the log, i.e. would
         # look orphaned to backward scan if backward scan wisnae
@@ -163,7 +163,7 @@ def test_orphan_scan(self):
 
         # Run a tagging forward scrub
         tag = "mytag123"
-        self.fs.mds_asok(["tag", "path", "/parent", tag])
+        self.fs.rank_asok(["tag", "path", "/parent", tag])
 
         # See that the orphan wisnae tagged
         self.assertUntagged(inos['./parent/flushed/bravo'])
@@ -175,14 +175,21 @@ def test_orphan_scan(self):
         # See that journalled-but-not-flushed file *was* tagged
         self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
 
-        # Run cephfs-data-scan targeting only orphans
+        # okay, now we are going to run cephfs-data-scan. It's necessary to
+        # have a clean journal, otherwise replay will blow up on mismatched
+        # inotable versions (due to scan_links)
+        self.fs.flush()
         self.fs.fail()
+        self.fs.journal_tool(["journal", "reset", "--force"], 0)
+
+        # Run cephfs-data-scan targeting only orphans
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
         self.fs.data_scan([
             "scan_inodes",
             "--filter-tag", tag,
             self.fs.get_data_pool_name()
         ])
+        self.fs.data_scan(["scan_links"])
 
         # After in-place injection stats should be kosher again
         self.fs.set_ceph_conf('mds', 'mds verify scatter', True)

src/common/options/mds.yaml.in

Lines changed: 34 additions & 0 deletions
@@ -952,6 +952,40 @@ options:
   - mds
   fmt_desc: The debug subtree invariants (for developers only).
   with_legacy: true
+- name: mds_abort_on_newly_corrupt_dentry
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mds
+  fmt_desc: MDS will abort if dentry is detected newly corrupted.
+- name: mds_go_bad_corrupt_dentry
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mds
+  fmt_desc: MDS will mark a corrupt dentry as bad and isolate it.
+  flags:
+  - runtime
+- name: mds_inject_rename_corrupt_dentry_first
+  type: float
+  level: dev
+  default: 0.0
+  services:
+  - mds
+  fmt_desc: probabilistically inject corrupt CDentry::first at rename
+  flags:
+  - runtime
+- name: mds_inject_journal_corrupt_dentry_first
+  type: float
+  level: dev
+  default: 0.0
+  services:
+  - mds
+  fmt_desc: probabilistically inject corrupt CDentry::first at journal load
+  flags:
+  - runtime
 - name: mds_kill_mdstable_at
   type: int
   level: dev
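
The go-bad and injection knobs carry the runtime flag, so the tests flip them on a live MDS through the QA config helpers and give the change a moment to reach the daemons, as in test_damage.py above:

    # inject CDentry::first corruption on every rename (probability 1.0)
    self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
    time.sleep(5)  # for conf to percolate to the MDS
    # ... perform the rename that trips the injection ...
    self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")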

src/mds/CDentry.cc

Lines changed: 24 additions & 0 deletions
@@ -17,6 +17,7 @@
 #include "CDentry.h"
 #include "CInode.h"
 #include "CDir.h"
+#include "SnapClient.h"
 
 #include "MDSRank.h"
 #include "MDCache.h"
@@ -697,4 +698,27 @@ bool CDentry::scrub(snapid_t next_seq)
   return false;
 }
 
+bool CDentry::check_corruption(bool load)
+{
+  auto&& snapclient = dir->mdcache->mds->snapclient;
+  auto next_snap = snapclient->get_last_seq()+1;
+  if (first > last || (snapclient->is_server_ready() && first > next_snap)) {
+    if (load) {
+      dout(1) << "loaded already corrupt dentry: " << *this << dendl;
+      corrupt_first_loaded = true;
+    } else {
+      derr << "newly corrupt dentry to be committed: " << *this << dendl;
+    }
+    if (g_conf().get_val<bool>("mds_go_bad_corrupt_dentry")) {
+      dir->go_bad_dentry(last, get_name());
+    }
+    if (!load && g_conf().get_val<bool>("mds_abort_on_newly_corrupt_dentry")) {
+      dir->mdcache->mds->clog->error() << "MDS abort because newly corrupt dentry to be committed: " << *this;
+      ceph_abort("detected newly corrupt dentry"); /* avoid writing out newly corrupted dn */
+    }
+    return true;
+  }
+  return false;
+}
+
 MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co);
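
The invariant check_corruption() enforces: a dentry's snapid range [first, last] must not be inverted, and first must not run ahead of the snap server's next sequence number. Restated in Python for clarity (a direct transcription of the C++ condition above, not part of the patch):

    def is_corrupt_dentry(first, last, next_snap, server_ready):
        # corrupt when the range is inverted, or when `first` claims a snapid
        # the snap server has not handed out yet
        return first > last or (server_ready and first > next_snap)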

src/mds/CDentry.h

Lines changed: 3 additions & 0 deletions
@@ -160,6 +160,8 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry>
     return dentry_key_t(last, name.c_str(), hash);
   }
 
+  bool check_corruption(bool load);
+
   const CDir *get_dir() const { return dir; }
   CDir *get_dir() { return dir; }
   std::string_view get_name() const { return std::string_view(name); }
@@ -367,6 +369,7 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry>
 
   __u32 hash;
   snapid_t first, last;
+  bool corrupt_first_loaded = false; /* for Postgres corruption detection */
 
   elist<CDentry*>::item item_dirty, item_dir_dirty;
   elist<CDentry*>::item item_stray;
