
Commit 75bcfd1

Merge PR ceph#55758 into main
* refs/pull/55758/head:
  doc: update 'journal reset' command with --yes-i-really-really-mean-it
  qa: fix cephfs-journal-tool command options and make fs inactive
  cephfs-journal-tool: Add warning messages during 'journal reset' and prevent execution on active fs

Reviewed-by: Dhairya Parmar <[email protected]>
Reviewed-by: Venky Shankar <[email protected]>
2 parents d00314d + 42953ec

14 files changed: +65 -23 lines changed

PendingReleaseNotes

Lines changed: 4 additions & 0 deletions
@@ -201,6 +201,10 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
   and the new feature bit for more information.
 
 * cls_cxx_gather is marked as deprecated.
+* CephFS: cephfs-journal-tool is guarded against running on an online file system.
+  The 'cephfs-journal-tool --rank <fs_name>:<mds_rank> journal reset' and
+  'cephfs-journal-tool --rank <fs_name>:<mds_rank> journal reset --force'
+  commands require '--yes-i-really-really-mean-it'.
 
 * Dashboard: Rearranged Navigation Layout: The navigation layout has been reorganized
   for improved usability and easier access to key features.

doc/cephfs/cephfs-journal-tool.rst

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ examining, modifying, and extracting data from journals.
 
     This tool is **dangerous** because it directly modifies internal
     data structures of the file system. Make backups, be careful, and
-    seek expert advice. If you are unsure, do not run this tool.
+    seek expert advice. If you are unsure, do not run this tool. As a
+    precaution, cephfs-journal-tool doesn't work on an active filesystem.
 
 Syntax
 ------
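Taken together with the release note above, the documented workflow becomes: take the file system offline, run the reset with the new acknowledgement flag, then let the MDS daemons rejoin. Below is a minimal sketch of that sequence as a Python wrapper around the CLI; the file system name "cephfs" and the single rank 0 are placeholders chosen for illustration, not part of this change.

    import subprocess

    FS_NAME = "cephfs"          # placeholder file system name, not from this commit
    RANK = f"{FS_NAME}:0"       # assumes a single active MDS rank

    def run(*cmd):
        # Run a CLI command and raise if it exits non-zero.
        subprocess.run(cmd, check=True)

    # 1. Take the file system offline; cephfs-journal-tool now refuses to run otherwise.
    run("ceph", "fs", "fail", FS_NAME)

    # 2. Reset the journal; the new flag acknowledges the destructive-operation warning.
    run("cephfs-journal-tool", "--rank", RANK,
        "journal", "reset", "--yes-i-really-really-mean-it")

    # 3. Allow MDS daemons to join again and bring the file system back up.
    run("ceph", "fs", "set", FS_NAME, "joinable", "true")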

doc/cephfs/disaster-recovery-experts.rst

Lines changed: 4 additions & 4 deletions
@@ -68,9 +68,9 @@ truncate it like so:
 
 ::
 
-    cephfs-journal-tool [--rank=N] journal reset
+    cephfs-journal-tool [--rank=<fs_name>:{mds-rank|all}] journal reset --yes-i-really-really-mean-it
 
-Specify the MDS rank using the ``--rank`` option when the file system has/had
+Specify the filesystem and the MDS rank using the ``--rank`` option when the file system has/had
 multiple active MDS.
 
 .. warning::

@@ -135,7 +135,7 @@ objects.
     # InoTable
     cephfs-table-tool 0 reset inode
     # Journal
-    cephfs-journal-tool --rank=0 journal reset
+    cephfs-journal-tool --rank=<fs_name>:0 journal reset --yes-i-really-really-mean-it
     # Root inodes ("/" and MDS directory)
     cephfs-data-scan init
 

@@ -253,7 +253,7 @@ Next, we will create the intial metadata for the fs:
     cephfs-table-tool cephfs_recovery:0 reset session
     cephfs-table-tool cephfs_recovery:0 reset snap
     cephfs-table-tool cephfs_recovery:0 reset inode
-    cephfs-journal-tool --rank cephfs_recovery:0 journal reset --force
+    cephfs-journal-tool --rank cephfs_recovery:0 journal reset --force --yes-i-really-really-mean-it
 
 Now perform the recovery of the metadata pool from the data pool:
 
qa/tasks/cephfs/test_damage.py

Lines changed: 1 addition & 1 deletion
@@ -498,7 +498,7 @@ def test_open_ino_errors(self):
 
         # Drop everything from the MDS cache
         self.fs.fail()
-        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--yes-i-really-really-mean-it'], 0)
         self.fs.set_joinable()
         self.fs.wait_for_daemons()

qa/tasks/cephfs/test_data_scan.py

Lines changed: 2 additions & 2 deletions
@@ -447,9 +447,9 @@ def get_state(mds_id):
         if False:
             with self.assertRaises(CommandFailedError):
                 # Normal reset should fail when no objects are present, we'll use --force instead
-                self.fs.journal_tool(["journal", "reset"], 0)
+                self.fs.journal_tool(["journal", "reset", "--yes-i-really-really-mean-it"], 0)
 
-        self.fs.journal_tool(["journal", "reset", "--force"], 0)
+        self.fs.journal_tool(["journal", "reset", "--force", "--yes-i-really-really-mean-it"], 0)
         self.fs.data_scan(["init"])
         self.fs.data_scan(["scan_extents"], worker_count=workers)
         self.fs.data_scan(["scan_inodes"], worker_count=workers)

qa/tasks/cephfs/test_flush.py

Lines changed: 7 additions & 1 deletion
@@ -3,7 +3,6 @@
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
 
-
 class TestFlush(CephFSTestCase):
     def test_flush(self):
         self.mount_a.run_shell(["mkdir", "mydir"])

@@ -44,7 +43,10 @@ def test_flush(self):
 
         # ...and the journal is truncated to just a single subtreemap from the
         # newly created segment
+        self.fs.fail()
         summary_output = self.fs.journal_tool(["event", "get", "summary"], 0)
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
         try:
             self.assertEqual(summary_output,
                              dedent(

@@ -72,6 +74,8 @@ def test_flush(self):
                              ).strip())
         flush_data = self.fs.mds_asok(["flush", "journal"])
         self.assertEqual(flush_data['return_code'], 0)
+
+        self.fs.fail()
         self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0),
                          dedent(
                              """

@@ -80,6 +84,8 @@ def test_flush(self):
                              Errors: 0
                              """
                          ).strip())
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
 
         # Now for deletion!
         # We will count the RADOS deletions and MDS file purges, to verify that
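Several of the qa changes in this commit repeat the same fail / journal_tool / set_joinable / wait_for_daemons sequence around each cephfs-journal-tool call. A hypothetical helper, not part of this commit, could wrap that pattern as a context manager; it assumes the qa Filesystem object seen above, with fail(), set_joinable() and wait_for_daemons() methods.

    from contextlib import contextmanager

    @contextmanager
    def offline_fs(fs):
        # Take the file system offline so cephfs-journal-tool is permitted to run,
        # then bring the MDS daemons back regardless of how the block exits.
        fs.fail()
        try:
            yield fs
        finally:
            fs.set_joinable()
            fs.wait_for_daemons()

    # Hypothetical usage inside a test:
    #   with offline_fs(self.fs):
    #       summary_output = self.fs.journal_tool(["event", "get", "summary"], 0)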

qa/tasks/cephfs/test_forward_scrub.py

Lines changed: 2 additions & 2 deletions
@@ -184,7 +184,7 @@ def test_orphan_scan(self):
         # inotable versions (due to scan_links)
         self.fs.flush()
         self.fs.fail()
-        self.fs.journal_tool(["journal", "reset", "--force"], 0)
+        self.fs.journal_tool(["journal", "reset", "--force", "--yes-i-really-really-mean-it"], 0)
 
         # Run cephfs-data-scan targeting only orphans
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])

@@ -411,7 +411,7 @@ def test_health_status_after_dirfrag_repair(self):
 
         self.fs.radosm(["rm", "{0:x}.00000000".format(dir_ino)])
 
-        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--yes-i-really-really-mean-it'], 0)
         self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()

qa/tasks/cephfs/test_journal_migration.py

Lines changed: 3 additions & 0 deletions
@@ -67,6 +67,7 @@ def test_journal_migration(self):
         ))
 
         # Verify that cephfs-journal-tool can now read the rewritten journal
+        self.fs.fail()
         inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
         if not inspect_out.endswith(": OK"):
             raise RuntimeError("Unexpected journal-tool result: '{0}'".format(

@@ -84,6 +85,8 @@ def test_journal_migration(self):
         if event_count < 1000:
             # Approximate value of "lots", expected from having run fsstress
             raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
 
         # Do some client work to check that writing the log is still working
         with self.mount_a.mounted_wait():

qa/tasks/cephfs/test_journal_repair.py

Lines changed: 2 additions & 2 deletions
@@ -86,7 +86,7 @@ def test_inject_to_empty(self):
 
         # Now check the MDS can read what we wrote: truncate the journal
         # and start the mds.
-        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--yes-i-really-really-mean-it'], 0)
         self.fs.set_joinable()
         self.fs.wait_for_daemons()
 

@@ -231,7 +231,7 @@ def is_marked_damaged():
         self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
         self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
         self.fs.table_tool(["0", "reset", "session"])
-        self.fs.journal_tool(["journal", "reset"], 0)
+        self.fs.journal_tool(["journal", "reset", "--yes-i-really-really-mean-it"], 0)
         self.fs.erase_mds_objects(1)
         self.run_ceph_cmd('fs', 'reset', self.fs.name,
                           '--yes-i-really-mean-it')

qa/tasks/cephfs/test_recovery_pool.py

Lines changed: 2 additions & 2 deletions
@@ -138,7 +138,7 @@ def _rebuild_metadata(self, workload, other_pool=None, workers=1):
         if False:
             with self.assertRaises(CommandFailedError):
                 # Normal reset should fail when no objects are present, we'll use --force instead
-                self.fs.journal_tool(["journal", "reset"], 0)
+                self.fs.journal_tool(["journal", "reset", "--yes-i-really-really-mean-it"], 0)
 
         recovery_fs.data_scan(['scan_extents', '--alternate-pool',
                                recovery_pool, '--filesystem', self.fs.name,

@@ -150,7 +150,7 @@ def _rebuild_metadata(self, workload, other_pool=None, workers=1):
         recovery_fs.data_scan(['scan_links', '--filesystem', recovery_fs.name])
         recovery_fs.journal_tool(['event', 'recover_dentries', 'list',
                                   '--alternate-pool', recovery_pool], 0)
-        recovery_fs.journal_tool(["journal", "reset", "--force"], 0)
+        recovery_fs.journal_tool(["journal", "reset", "--force", "--yes-i-really-really-mean-it"], 0)
 
         # Start the MDS
         recovery_fs.set_joinable()
