Skip to content

Commit 18c7799

Browse files
Merge pull request ceph#56066 from rishabh-d-dave/mds-fail-confirm
mon,cephfs: require confirmation flag to bring down unhealthy MDS Reviewed-by: Leonid Usov <[email protected]> Reviewed-by: Patrick Donnelly <[email protected]>
2 parents 6f3d652 + 214d614 commit 18c7799

File tree

11 files changed

+280
-7
lines changed

11 files changed

+280
-7
lines changed

PendingReleaseNotes

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,11 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
178178
and it can be disabled by using:
179179
`ceph config set mgr mgr/volumes/snapshot_clone_no_wait false`
180180

181+
* CephFS: Commands "ceph mds fail" and "ceph fs fail" now require a
182+
confirmation flag when some MDSs exhibit health warning MDS_TRIM or
183+
MDS_CACHE_OVERSIZED. This is to prevent accidental MDS failover causing
184+
further delays in recovery.
185+
181186
>=18.0.0
182187

183188
* The RGW policy parser now rejects unknown principals by default. If you are

doc/cephfs/add-remove-mds.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,11 @@ the following method.
117117

118118
$ sudo rm -rf /var/lib/ceph/mds/ceph-${id}
119119

120+
121+
.. note:: When an active MDS has either health warning MDS_TRIM or
122+
MDS_CACHE_OVERSIZED, the confirmation flag (--yes-i-really-mean-it)
123+
needs to be passed, else the command will fail. It is not recommended to
124+
restart an MDS which has these warnings since slow recovery at restart may
125+
lead to more problems.
126+
120127
.. _MDS Config Reference: ../mds-config-ref

doc/cephfs/administration.rst

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,11 @@ file system and MDS daemons down, use the ``ceph fs fail`` command:
193193

194194
::
195195

196-
ceph fs fail <fs_name>
196+
ceph fs fail <fs_name> {--yes-i-really-mean-it}
197+
198+
.. note:: The confirmation flag is optional because it is only required
199+
when the MDS is active and has health warning MDS_TRIM or
200+
MDS_CACHE_OVERSIZED.
197201

198202
This command sets a file system flag to prevent standbys from
199203
activating on the file system (the ``joinable`` flag).
@@ -210,7 +214,11 @@ respawn as standbys. The file system will be left in a degraded state.
210214
::
211215

212216
# For all ranks, 0-N:
213-
ceph mds fail <fs_name>:<n>
217+
ceph mds fail <fs_name>:<n> {--yes-i-really-mean-it}
218+
219+
.. note:: The confirmation flag is optional because it is only required
220+
when the MDS is active and has health warning MDS_TRIM or
221+
MDS_CACHE_OVERSIZED.
214222

215223
Once all ranks are inactive, the file system may also be deleted or left in
216224
this state for other purposes (perhaps disaster recovery).

qa/suites/fs/functional/tasks/admin.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ overrides:
55
lockdep: true
66
log-ignorelist:
77
- missing required features
8+
- \(MDS_CACHE_OVERSIZED\)
9+
- \(MDS_TRIM\)
810
tasks:
911
- cephfs_test_runner:
1012
fail_on_skip: false

qa/tasks/cephfs/filesystem.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,12 @@ def reset(self):
612612
self.run_ceph_cmd("fs", "reset", str(self.name), '--yes-i-really-mean-it')
613613

614614
def fail(self):
    """
    Run "ceph fs fail" on this file system.

    The command is first issued without any flag. If that attempt raises
    CommandFailedError, it is issued a second time with
    "--yes-i-really-mean-it" appended. A failure of the second attempt
    propagates to the caller.
    """
    fail_cmd = ["fs", "fail", str(self.name)]
    try:
        self.run_ceph_cmd(fail_cmd)
    except CommandFailedError:
        # Second and last attempt, this time with the confirmation flag.
        self.run_ceph_cmd(fail_cmd + ["--yes-i-really-mean-it"])
616621

617622
def set_flag(self, var, *args):
618623
a = map(lambda x: str(x).lower(), args)
@@ -1181,8 +1186,13 @@ def rank_freeze(self, yes, rank=0):
11811186
def rank_repaired(self, rank):
11821187
self.run_ceph_cmd("mds", "repaired", "{}:{}".format(self.id, rank))
11831188

1184-
def rank_fail(self, rank=0, confirm=True):
    """
    Run "ceph mds fail <id>:<rank>" for the given rank.

    The command is first issued without any flag. If that attempt raises
    CommandFailedError and ``confirm`` is true, it is retried once with
    "--yes-i-really-mean-it" appended.

    :param rank: rank to fail, combined with ``self.id`` as "<id>:<rank>".
    :param confirm: when true (the default), retry with the confirmation
                    flag after a failed first attempt; when false, let the
                    first failure propagate.
    :raises CommandFailedError: if the first attempt fails and ``confirm``
                                is false, or if the retry fails as well.
    """
    cmd = f'mds fail {self.id}:{rank}'
    try:
        self.run_ceph_cmd(args=cmd)
    except CommandFailedError:
        if not confirm:
            raise
        # The flag must be spelled exactly "--yes-i-really-mean-it"; any
        # other spelling is not the confirmation flag.
        self.run_ceph_cmd(args=f'{cmd} --yes-i-really-mean-it')
11861196

11871197
def rank_is_running(self, rank=0, status=None):
11881198
name = self.get_rank(rank=rank, status=status)['name']

qa/tasks/cephfs/test_admin.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,38 @@ def setup_ec_pools(self, n, metadata=True, overwrites=True):
9191
if overwrites:
9292
self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
9393

94+
def _get_unhealthy_mds_id(self, health_report, health_warn):
    '''
    Return the ID of the MDS named in the first detail message of the
    health check "health_warn".

    :param health_report: parsed output of "ceph health detail --format
                          json"; must contain
                          ['checks'][health_warn]['detail'][0]['message'].
    :param health_warn: name of the health check, e.g. "MDS_TRIM".
    :return: the MDS ID without its leading "mds." prefix.
    '''
    # "msg" is expected to look something like this -
    # 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
    # num_segments: 86'
    msg = health_report['checks'][health_warn]['detail'][0]['message']
    daemon_name = msg.split('(', 1)[0]
    # Strip only the leading "mds." so that an ID which itself contains
    # "mds." is returned intact.
    prefix = 'mds.'
    if daemon_name.startswith(prefix):
        daemon_name = daemon_name[len(prefix):]
    return daemon_name
106+
107+
def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
                          tries=10):
    """
    Poll "ceph health detail" until the check "health_warn" is listed.

    On every iteration "cache status" is first requested from
    mds.<active_mds_id> (its output is discarded), then the JSON health
    report is fetched and its 'checks' section is searched for
    "health_warn". The method returns as soon as the check is found.

    :param health_warn: name of the health check to wait for.
    :param active_mds_id: ID of the MDS that receives "cache status".
    :param sleep: forwarded to safe_while as ``sleep``.
    :param tries: forwarded to safe_while as ``tries``.

    What happens once the attempts run out is decided by safe_while,
    which receives the message built below as its ``action``.
    """
    timeout_msg = (f'Expected health warning "{health_warn}" to eventually '
                   'show up in output of command "ceph health detail". Tried '
                   f'{tries} times with interval of {sleep} seconds but the '
                   'health warning didn\'t turn up.')
    cache_status_cmd = f'tell mds.{active_mds_id} cache status'

    with safe_while(sleep=sleep, tries=tries,
                    action=timeout_msg) as keep_trying:
        while keep_trying():
            self.get_ceph_cmd_stdout(cache_status_cmd)

            raw_report = self.get_ceph_cmd_stdout(
                'health detail --format json')
            if health_warn in json.loads(raw_report)['checks']:
                return
124+
125+
94126
@classhook('_add_valid_tell')
95127
class TestValidTell(TestAdminCommands):
96128
@classmethod
@@ -2154,3 +2186,166 @@ def test_fs_authorize(self):
21542186
args=(f'fs authorize {self.fs.name} {self.CLIENT_NAME} / '
21552187
f'{wrong_perm}'), retval=self.EXPECTED_ERRNO,
21562188
errmsgs=self.EXPECTED_ERRMSG)
2189+
2190+
2191+
class TestFSFail(TestAdminCommands):
    # Each test provokes an MDS health warning and then checks that
    # "ceph fs fail" is rejected without the confirmation flag and accepted
    # with it.

    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 1

    def test_with_health_warn_oversize_cache(self):
        '''
        With health warning MDS_CACHE_OVERSIZED present for an MDS,
        "ceph fs fail" must fail without the confirmation flag and pass
        with it.
        '''
        warning = 'MDS_CACHE_OVERSIZED'
        for option, value in (('mds_cache_memory_limit', '1K'),
                              ('mds_health_cache_threshold', '1.00000')):
            self.config_set('mds', option, value)
        mds_id = self.fs.get_active_names()[0]

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(warning, mds_id)

        # the warning is up; now exercise the command itself.
        fail_cmd = f'fs fail {self.fs.name}'
        self.negtest_ceph_cmd(args=fail_cmd, retval=1,
                              errmsgs='mds_cache_oversized')
        self.run_ceph_cmd(f'{fail_cmd} --yes-i-really-mean-it')

    def test_with_health_warn_trim(self):
        '''
        With health warning MDS_TRIM present for an MDS, "ceph fs fail"
        must fail without the confirmation flag and pass with it.
        '''
        warning = 'MDS_TRIM'
        trim_settings = (
            # for generating health warning MDS_TRIM
            ('mds_debug_subtrees', 'true'),
            # these two slow trimming down a lot, so that MDS_TRIM stays
            # around for longer.
            ('mds_log_trim_decay_rate', '60'),
            ('mds_log_trim_threshold', '1'),
        )
        for option, value in trim_settings:
            self.config_set('mds', option, value)
        mds_id = self.fs.get_active_names()[0]

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(warning, mds_id)

        # the warning is up; now exercise the command itself.
        fail_cmd = f'fs fail {self.fs.name}'
        self.negtest_ceph_cmd(args=fail_cmd, retval=1, errmsgs='mds_trim')
        self.run_ceph_cmd(f'{fail_cmd} --yes-i-really-mean-it')

    def test_with_health_warn_with_2_active_MDSs(self):
        '''
        With 2 active MDSs and health warning MDS_CACHE_OVERSIZED raised,
        "ceph fs fail" must fail without the confirmation flag and pass
        with it.
        '''
        warning = 'MDS_CACHE_OVERSIZED'
        self.fs.set_max_mds(2)
        for option, value in (('mds_cache_memory_limit', '1K'),
                              ('mds_health_cache_threshold', '1.00000')):
            self.config_set('mds', option, value)
        self.fs.wait_for_daemons()
        # exactly two active MDSs are expected here; only the first one is
        # asked for its cache status while waiting.
        first_mds_id, _second_mds_id = self.fs.get_active_names()

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(warning, first_mds_id)

        # the warning is up; now exercise the command itself.
        fail_cmd = f'fs fail {self.fs.name}'
        self.negtest_ceph_cmd(args=fail_cmd, retval=1,
                              errmsgs='mds_cache_oversized')
        self.run_ceph_cmd(f'{fail_cmd} --yes-i-really-mean-it')
2263+
2264+
2265+
class TestMDSFail(TestAdminCommands):
    # Each test provokes an MDS health warning and then checks that
    # "ceph mds fail" is rejected without the confirmation flag and accepted
    # with it.

    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 1

    def test_with_health_warn_oversize_cache(self):
        '''
        Test that, when health warning MDS_CACHE_OVERSIZED is present for an
        MDS, command "ceph mds fail" fails without confirmation flag and
        passes when confirmation flag is passed.
        '''
        health_warn = 'MDS_CACHE_OVERSIZED'
        self.config_set('mds', 'mds_cache_memory_limit', '1K')
        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
        active_mds_id = self.fs.get_active_names()[0]

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(health_warn, active_mds_id)

        # actual testing begins now.
        errmsg = 'mds_cache_oversized'
        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
                              retval=1, errmsgs=errmsg)
        # "mds fail" takes an MDS (role, GID or name), not a file system
        # name, so the confirmed run must target the same MDS as above.
        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')

    def test_with_health_warn_trim(self):
        '''
        Test that, when health warning MDS_TRIM is present for an MDS, command
        "ceph mds fail" fails without confirmation flag and passes when
        confirmation flag is passed.
        '''
        health_warn = 'MDS_TRIM'
        # for generating health warning MDS_TRIM
        self.config_set('mds', 'mds_debug_subtrees', 'true')
        # this will really really slow the trimming, so that MDS_TRIM stays
        # for longer.
        self.config_set('mds', 'mds_log_trim_decay_rate', '60')
        self.config_set('mds', 'mds_log_trim_threshold', '1')
        active_mds_id = self.fs.get_active_names()[0]

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(health_warn, active_mds_id)

        # actual testing begins now...
        errmsg = 'mds_trim'
        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
                              retval=1, errmsgs=errmsg)
        # target the same MDS as the negative test, not the file system name.
        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')

    def test_with_health_warn_with_2_active_MDSs(self):
        '''
        Test that, when a CephFS has 2 active MDSs and one of them has health
        warning MDS_CACHE_OVERSIZED, running "ceph mds fail" fails for both
        MDSs without confirmation flag and passes for both when confirmation
        flag is passed.
        '''
        health_warn = 'MDS_CACHE_OVERSIZED'
        self.fs.set_max_mds(2)
        self.config_set('mds', 'mds_cache_memory_limit', '1K')
        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
        self.fs.wait_for_daemons()
        mds1_id, mds2_id = self.fs.get_active_names()

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(health_warn, mds1_id)

        health_report = json.loads(self.get_ceph_cmd_stdout('health detail '
                                                            '--format json'))
        # MDS ID for which health warning has been generated.
        hw_mds_id = self._get_unhealthy_mds_id(health_report, health_warn)
        if mds1_id == hw_mds_id:
            non_hw_mds_id = mds2_id
        elif mds2_id == hw_mds_id:
            non_hw_mds_id = mds1_id
        else:
            raise RuntimeError('There are only 2 MDSs right now but apparently '
                               'health warning was raised for an MDS other '
                               'than these two. This is definitely an error.')

        # actual testing begins now...
        errmsg = 'mds_cache_oversized'
        self.negtest_ceph_cmd(args=f'mds fail {non_hw_mds_id}', retval=1,
                              errmsgs=errmsg)
        self.negtest_ceph_cmd(args=f'mds fail {hw_mds_id}', retval=1,
                              errmsgs=errmsg)
        # interpolate the real MDS IDs; without the f-prefix the command
        # would be sent the literal text "mds1_id" / "mds2_id".
        self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
        self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')

src/mon/FSCommands.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,15 @@ class FailHandler : public FileSystemCommandHandler
108108
return -ENOENT;
109109
}
110110

111+
bool confirm = false;
112+
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
113+
if (!confirm &&
114+
mon->mdsmon()->has_health_warnings({
115+
MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
116+
ss << errmsg_for_unhealthy_mds;
117+
return -EPERM;
118+
}
119+
111120
auto f = [](auto&& fs) {
112121
fs.get_mds_map().set_flag(CEPH_MDSMAP_NOT_JOINABLE);
113122
};

src/mon/FSCommands.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,12 @@ class FileSystemCommandHandler : protected CommandHandler
9191
std::ostream &ss) = 0;
9292
};
9393

94+
95+
// Error text written to the command's output stream when "fs fail" or
// "mds fail" is refused with -EPERM because a health warning is present
// and the confirmation flag was not passed.
//
// Declared `inline` rather than `static` so that every translation unit
// including this header shares a single definition instead of getting its
// own internal-linkage copy.
inline constexpr const char* errmsg_for_unhealthy_mds =
  "MDS has one of two health warnings which could extend recovery: "
  "MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended "
  "since it might cause unexpected file system unavailability. If "
  "you wish to proceed, pass --yes-i-really-mean-it";
100+
101+
94102
#endif

src/mon/MDSMonitor.cc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1491,6 +1491,23 @@ bool MDSMonitor::prepare_command(MonOpRequestRef op)
14911491
}
14921492
}
14931493

1494+
// Report whether any entry of pending_daemon_health carries a health
// metric whose type is listed in "warnings".
//
// @param warnings  health warning types to look for.
// @return true as soon as one matching metric is found, false if none of
//         the entries has a metric of a requested type.
bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
{
  // metric.type is the type of the health warning; a metric matches when
  // that type is one of those passed in through "warnings".
  const auto is_requested = [&warnings](const auto& metric) {
    return std::find(warnings.begin(), warnings.end(), metric.type) !=
      warnings.end();
  };

  for (const auto& [gid, health] : pending_daemon_health) {
    if (std::any_of(health.metrics.begin(), health.metrics.end(),
                    is_requested)) {
      return true;
    }
  }

  return false;
}
1510+
14941511
int MDSMonitor::filesystem_command(
14951512
FSMap &fsmap,
14961513
MonOpRequestRef op,
@@ -1528,6 +1545,8 @@ int MDSMonitor::filesystem_command(
15281545
} else if (prefix == "mds fail") {
15291546
string who;
15301547
cmd_getval(cmdmap, "role_or_gid", who);
1548+
bool confirm = false;
1549+
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
15311550

15321551
MDSMap::mds_info_t failed_info;
15331552
mds_gid_t gid = gid_from_arg(fsmap, who, ss);
@@ -1547,6 +1566,12 @@ int MDSMonitor::filesystem_command(
15471566
return -EPERM;
15481567
}
15491568

1569+
if (!confirm &&
1570+
has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
1571+
ss << errmsg_for_unhealthy_mds;
1572+
return -EPERM;
1573+
}
1574+
15501575
r = fail_mds(fsmap, ss, who, &failed_info);
15511576
if (r < 0 && r == -EAGAIN) {
15521577
mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));

src/mon/MDSMonitor.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include <map>
2222
#include <set>
23+
#include <vector>
2324

2425
#include "include/types.h"
2526
#include "PaxosFSMap.h"
@@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
5152
bool preprocess_query(MonOpRequestRef op) override; // true if processed.
5253
bool prepare_update(MonOpRequestRef op) override;
5354
bool should_propose(double& delay) override;
55+
bool has_health_warnings(std::vector<mds_metric_t> warnings);
5456

5557
bool should_print_status() const {
5658
auto& fs = get_fsmap();

0 commit comments

Comments
 (0)