Skip to content

Commit 8978b85

Browse files
Merge pull request ceph#59420 from rishabh-d-dave/max-mds-confirm
mon,cephfs: require confirmation when changing max_mds on unhealthy cluster Reviewed-by: Patrick Donnelly <[email protected]> Reviewed-by: Venky Shankar <[email protected]>
2 parents 67b374c + a71c8e8 commit 8978b85

File tree

8 files changed

+103
-4
lines changed

8 files changed

+103
-4
lines changed

PendingReleaseNotes

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@
2626
- osd_op_num_threads_per_shard_hdd = 5 (was 1)
2727
For more details see https://tracker.ceph.com/issues/66289.
2828

29+
* CephFS: Modifying the FS setting variable "max_mds" when a cluster is
30+
unhealthy now requires users to pass the confirmation flag
31+
(--yes-i-really-mean-it). This has been added as a precaution to tell the
32+
users that modifying "max_mds" may not help with troubleshooting or recovery
33+
effort. Instead, it might further destabilize the cluster.
34+
35+
36+
2937
>=19.0.0
3038

3139
* cephx: key rotation is now possible using `ceph auth rotate`. Previously,

doc/cephfs/administration.rst

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,17 @@ is a subset of the same information from the ``ceph fs dump`` command.
6161

6262
::
6363

64-
ceph fs set <file system name> <var> <val>
64+
ceph fs set <file system name> <var> <val> [--yes-i-really-mean-it]
6565

6666
Change a setting on a file system. These settings are specific to the named
67-
file system and do not affect other file systems.
67+
file system and do not affect other file systems. Confirmation flag is only
68+
needed for changing ``max_mds`` when cluster is unhealthy.
69+
70+
.. note:: It is mandatory to pass confirmation flag (--yes--i-really-mean-it)
71+
for modifying FS setting variable ``max_mds`` when cluster is unhealthy.
72+
It has been added a precaution to tell users that modifying ``max_mds``
73+
during troubleshooting or recovery might not help. Instead, it might
74+
further destabilize the cluster.
6875

6976
::
7077

doc/cephfs/troubleshooting.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,11 @@ things to do:
128128
129129
That prevents any clients from establishing new sessions with the MDS.
130130
131+
* **Dont tweak max_mds** Modifying the FS setting variable ``max_mds`` is
132+
sometimes perceived as a good step during troubleshooting or recovery effort.
133+
Instead, doing so might further destabilize the cluster. If ``max_mds`` must
134+
be changed in such circumstances, run the command to change ``max_mds`` with
135+
the confirmation flag (``--yes-i-really-mean-it``)
131136

132137

133138
Expediting MDS journal trim

qa/tasks/cephfs/filesystem.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -640,8 +640,11 @@ def set_down(self, down=True):
640640
def set_joinable(self, joinable=True):
641641
self.set_var("joinable", joinable)
642642

643-
def set_max_mds(self, max_mds):
644-
self.set_var("max_mds", "%d" % max_mds)
643+
def set_max_mds(self, max_mds, confirm=True):
644+
if confirm:
645+
self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it')
646+
else:
647+
self.set_var("max_mds", f"{max_mds}",)
645648

646649
def set_session_timeout(self, timeout):
647650
self.set_var("session_timeout", "%d" % timeout)

qa/tasks/cephfs/test_admin.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2683,3 +2683,60 @@ def test_with_health_warn_with_2_active_MDSs(self):
26832683
errmsgs=health_warn)
26842684
self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
26852685
self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')
2686+
2687+
2688+
class TestFSSetMaxMDS(TestAdminCommands):
2689+
2690+
def test_when_unhealthy_without_confirm(self):
2691+
'''
2692+
Test that command "ceph fs set <fsname> max_mds <num>" without the
2693+
confirmation flag (--yes-i-really-mean-it) fails when cluster is
2694+
unhealthy.
2695+
'''
2696+
self.gen_health_warn_mds_cache_oversized()
2697+
2698+
with self.assertRaises(CommandFailedError) as cfe:
2699+
self.fs.set_max_mds(2, confirm=False)
2700+
self.assertEqual(cfe.exception.exitstatus, errno.EPERM)
2701+
2702+
def test_when_unhealthy_with_confirm(self):
2703+
'''
2704+
Test that command "ceph fs set <fsname> max_mds <num>
2705+
--yes-i-really-mean-it" runs successfully when cluster is unhealthy.
2706+
'''
2707+
self.gen_health_warn_mds_cache_oversized()
2708+
2709+
self.fs.set_max_mds(2, confirm=True)
2710+
self.assertEqual(self.fs.get_var('max_mds'), 2)
2711+
2712+
def test_when_mds_trim_without_confirm(self):
2713+
'''
2714+
Test that command "ceph fs set <fsname> max_mds <num>" without the
2715+
confirmation flag (--yes-i-really-mean-it) fails when cluster has
2716+
MDS_TRIM health warning.
2717+
'''
2718+
self.gen_health_warn_mds_trim()
2719+
2720+
with self.assertRaises(CommandFailedError) as cfe:
2721+
self.fs.set_max_mds(2, confirm=False)
2722+
self.assertEqual(cfe.exception.exitstatus, errno.EPERM)
2723+
2724+
def test_when_mds_trim_when_with_confirm(self):
2725+
'''
2726+
Test that command "ceph fs set <fsname> max_mds <num>
2727+
--yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM
2728+
health warning.
2729+
'''
2730+
self.gen_health_warn_mds_trim()
2731+
2732+
self.fs.set_max_mds(2, confirm=True)
2733+
self.assertEqual(self.fs.get_var('max_mds'), 2)
2734+
2735+
def test_when_healthy_with_confirm(self):
2736+
'''
2737+
Test that command "ceph fs set <fsname> max_mds <num>
2738+
--yes-i-really-mean-it" runs successfully also when cluster is
2739+
healthy.
2740+
'''
2741+
self.fs.set_max_mds(2, confirm=True)
2742+
self.assertEqual(self.fs.get_var('max_mds'), 2)

src/mon/FSCommands.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,17 @@ class SetHandler : public FileSystemCommandHandler
385385
return -EINVAL;
386386
}
387387

388+
bool confirm = false;
389+
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
390+
if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) {
391+
ss << "One or more file system health warnings are present. Modifying "
392+
<< "the file system setting variable \"max_mds\" may not help "
393+
<< "troubleshoot or recover from these warnings and may further "
394+
<< "destabilize the system. If you really wish to proceed, run "
395+
<< "again with --yes-i-really-mean-it";
396+
return -EPERM;
397+
}
398+
388399
return set_val(mon, fsmap, op, cmdmap, ss, fsp->get_fscid(), var, val);
389400
}
390401
};

src/mon/MDSMonitor.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,6 +1557,13 @@ bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
15571557
return false;
15581558
}
15591559

1560+
bool MDSMonitor::has_any_health_warning()
1561+
{
1562+
return std::any_of(
1563+
pending_daemon_health.begin(), pending_daemon_health.end(),
1564+
[](auto& it) { return !it.second.metrics.empty() ? true : false; });
1565+
}
1566+
15601567
int MDSMonitor::filesystem_command(
15611568
FSMap &fsmap,
15621569
MonOpRequestRef op,

src/mon/MDSMonitor.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
5353
bool prepare_update(MonOpRequestRef op) override;
5454
bool should_propose(double& delay) override;
5555
bool has_health_warnings(std::vector<mds_metric_t> warnings);
56+
bool has_any_health_warning();
5657

5758
bool should_print_status() const {
5859
auto& fs = get_fsmap();

0 commit comments

Comments
 (0)