Skip to content

Commit fca07e9

Browse files
authored
Merge pull request ceph#58647 from rishabh-d-dave/mgr-vol-mod-disable
mgr: allow disabling always-on modules
2 parents d10ea1f + 3232f6b commit fca07e9

File tree

9 files changed

+322
-11
lines changed

9 files changed

+322
-11
lines changed

PendingReleaseNotes

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
- osd_op_num_shards_hdd = 1 (was 5)
2626
- osd_op_num_threads_per_shard_hdd = 5 (was 1)
2727
For more details see https://tracker.ceph.com/issues/66289.
28+
* MGR: MGR's always-on modules/plugins can now be force-disabled. This can be
29+
necessary in cases where MGR(s) need to be prevented from being flooded by
30+
the module commands when the corresponding Ceph service is down/degraded.
2831

2932
* CephFS: Modifying the FS setting variable "max_mds" when a cluster is
3033
unhealthy now requires users to pass the confirmation flag

doc/cephfs/fs-volumes.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,5 +1419,28 @@ set with this id was present in the database
14191419

14201420
$ ceph fs quiesce fs1 sub1 sub2 sub3 --set-id="external-id" --if-version=0
14211421

1422+
1423+
.. _disabling-volumes-plugin:
1424+
1425+
Disabling Volumes Plugin
1426+
------------------------
1427+
By default the volumes plugin is enabled and set to ``always on``. However, in
1428+
certain cases it might be appropriate to disable it. For example, when a CephFS
1429+
is in a degraded state, the volumes plugin commands may accumulate in MGR
1430+
instead of getting served, which eventually causes policy throttles to kick in
1431+
and the MGR to become unresponsive.
1432+
1433+
In this event, the volumes plugin can be disabled even though it is an
1434+
``always on`` module in MGR. To do so, run ``ceph mgr module force disable volumes
1435+
--yes-i-really-mean-it``. Do note that this command will disable operations
1436+
and remove commands of the volumes plugin since it will disable all CephFS
1437+
services on the Ceph cluster accessed through this plugin.
1438+
1439+
Before resorting to a measure as drastic as this, it is a good idea to try less
1440+
drastic measures and then assess whether the file system experience has improved
1441+
as a result. One example of such a less drastic measure is to disable the asynchronous
1442+
threads launched by the volumes plugin for cloning and purging trash.
1443+
1444+
14221445
.. _manila: https://github.com/openstack/manila
14231446
.. _CSI: https://github.com/ceph/ceph-csi

doc/cephfs/troubleshooting.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,11 @@ its associated key. A less drastic but half-fix is to change the osd cap for
412412
your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs
413413
data=....``
414414

415+
Disabling Volumes Plugin
416+
========================
417+
In certain scenarios, the volumes plugin might need to be disabled to prevent compromising
418+
the rest of the Ceph cluster. For details see: :ref:`disabling-volumes-plugin`
419+
415420
Reporting Issues
416421
================
417422

qa/cephfs/overrides/ignorelist_health.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@ overrides:
2424
- BLUESTORE_SLOW_OP_ALERT
2525
- slow operation indications in BlueStore
2626
- experiencing slow operations in BlueStore
27+
- MGR_MODULE_ERROR

qa/tasks/cephfs/test_admin.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2740,3 +2740,184 @@ def test_when_healthy_with_confirm(self):
27402740
'''
27412741
self.fs.set_max_mds(2, confirm=True)
27422742
self.assertEqual(self.fs.get_var('max_mds'), 2)
2743+
2744+
2745+
class TestToggleVolumes(CephFSTestCase):
    '''
    Tests for force-disabling and (re-)enabling the mgr/volumes plugin, which
    is an always-on MGR module.
    '''

    VOL_MOD_NAME = 'volumes'
    CONFIRM = '--yes-i-really-mean-it'

    def _get_mgr_module_ls(self):
        '''
        Run "ceph mgr module ls" and return its JSON output parsed into a
        dict.
        '''
        return json.loads(
            self.get_ceph_cmd_stdout('mgr module ls --format json'))

    def _force_disable_vol_mod(self):
        '''
        Force-disable the volumes plugin, passing the confirmation flag.
        '''
        self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
                          f'{self.CONFIRM}')

    def _assert_vol_mod_is_force_disabled(self, mod_ls):
        '''
        Assert that "mod_ls" (parsed output of "ceph mgr module ls") reports
        the volumes plugin as always-on and force-disabled, and lists it
        neither as enabled nor as disabled.
        '''
        self.assertIn(self.VOL_MOD_NAME, mod_ls['always_on_modules'])
        self.assertIn(self.VOL_MOD_NAME, mod_ls['force_disabled_modules'])

        self.assertNotIn(self.VOL_MOD_NAME, mod_ls['enabled_modules'])
        self.assertNotIn(self.VOL_MOD_NAME, mod_ls['disabled_modules'])

    def _assert_vol_mod_is_always_on(self, mod_ls):
        '''
        Assert that "mod_ls" (parsed output of "ceph mgr module ls") reports
        the volumes plugin as always-on only, i.e. it is absent from the
        enabled, disabled and force-disabled lists.
        '''
        self.assertIn(self.VOL_MOD_NAME, mod_ls['always_on_modules'])

        self.assertNotIn(self.VOL_MOD_NAME, mod_ls['enabled_modules'])
        self.assertNotIn(self.VOL_MOD_NAME, mod_ls['disabled_modules'])
        self.assertNotIn(self.VOL_MOD_NAME, mod_ls['force_disabled_modules'])

    def tearDown(self):
        '''
        Ensure that the volumes plugin is enabled after the test has finished
        running since not doing so might affect tearDown() of CephFSTestCase or
        other superclasses.
        '''
        try:
            mod_ls = self._get_mgr_module_ls()
            if self.VOL_MOD_NAME in mod_ls['force_disabled_modules']:
                self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}')
        finally:
            # Superclass cleanup must run even if re-enabling the plugin
            # failed, otherwise its resources are never released.
            super().tearDown()

    def test_force_disable_with_confirmation(self):
        '''
        Test that running "ceph mgr module force disable volumes
        --yes-i-really-mean-it" successfully disables volumes plugin.

        Also test "ceph mgr module ls" output after this.
        '''
        self._force_disable_vol_mod()

        self._assert_vol_mod_is_force_disabled(self._get_mgr_module_ls())

    def test_force_disable_fails_without_confirmation(self):
        '''
        Test that running "ceph mgr module force disable volumes" fails with
        EPERM when confirmation flag is not passed along.

        Also test that output of this command suggests user to pass
        --yes-i-really-mean-it.
        '''
        proc = self.run_ceph_cmd(
            f'mgr module force disable {self.VOL_MOD_NAME}',
            stderr=StringIO(), check_status=False)

        self.assertEqual(proc.returncode, errno.EPERM)

        proc_stderr = proc.stderr.getvalue()
        self.assertIn('EPERM', proc_stderr)
        # ensure that the confirmation flag was recommended
        self.assertIn(self.CONFIRM, proc_stderr)

    def test_force_disable_idempotency(self):
        '''
        Test that running "ceph mgr module force disable volumes" passes when
        volumes plugin was already force disabled.
        '''
        self._force_disable_vol_mod()
        sleep(5)

        self._assert_vol_mod_is_force_disabled(self._get_mgr_module_ls())

        # XXX: in this test, running this command a 2nd time should pass.
        # The confirmation flag is deliberately left out here.
        self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}')

    def test_force_disable_nonexistent_mod(self):
        '''
        Test that passing non-existent name to "ceph mgr module force disable"
        command leads to an error.
        '''
        proc = self.run_ceph_cmd(
            f'mgr module force disable abcd {self.CONFIRM}',
            check_status=False, stderr=StringIO())
        self.assertEqual(proc.returncode, errno.EINVAL)
        self.assertIn('EINVAL', proc.stderr.getvalue())

    def test_force_disable_non_alwayson_mod(self):
        '''
        Test that passing the name of a module that is not always-on to
        "ceph mgr module force disable" command leads to an error.
        '''
        enabled_mods = self._get_mgr_module_ls()['enabled_modules']
        if not enabled_mods:
            # Nothing to run the command against; avoid an IndexError that
            # would be reported as a test error instead.
            self.skipTest('no non-always-on MGR module is enabled')
        some_non_alwayson_mod = enabled_mods[0]

        proc = self.run_ceph_cmd(
            f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}',
            check_status=False, stderr=StringIO())
        self.assertEqual(proc.returncode, errno.EINVAL)
        self.assertIn('EINVAL', proc.stderr.getvalue())

    def test_enabled_by_default(self):
        '''
        Test that volumes plugin is enabled by default and is also reported as
        "always on".
        '''
        self._assert_vol_mod_is_always_on(self._get_mgr_module_ls())

    def test_disable_fails(self):
        '''
        Test that running "ceph mgr module disable volumes" fails with EPERM.

        This is expected since volumes is an always-on module and therefore
        it can only be disabled using command "ceph mgr module force disable
        volumes".
        '''
        proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}',
                                 stderr=StringIO(), check_status=False)
        self.assertEqual(proc.returncode, errno.EPERM)

        proc_stderr = proc.stderr.getvalue()
        self.assertIn('EPERM', proc_stderr)

    def test_enable_idempotency(self):
        '''
        Test that enabling volumes plugin when it is already enabled doesn't
        exit with non-zero return value.

        Also test that it reports plugin as already enabled.
        '''
        proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}',
                                 stderr=StringIO())
        self.assertEqual(proc.returncode, 0)

        proc_stderr = proc.stderr.getvalue()
        self.assertIn('already enabled', proc_stderr)
        self.assertIn('always-on', proc_stderr)

    def test_enable_post_disabling(self):
        '''
        Test that enabling volumes plugin after (force-)disabling it works
        successfully.

        Also test "ceph mgr module ls" output for volumes plugin afterwards.
        '''
        self._force_disable_vol_mod()
        # give bit of time for plugin to be disabled.
        sleep(5)

        self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}')
        # give bit of time for plugin to be functional again
        sleep(5)

        self._assert_vol_mod_is_always_on(self._get_mgr_module_ls())

        # plugin is reported properly by "ceph mgr module ls" command, check if
        # it is also working fine.
        self.run_ceph_cmd('fs volume ls')

src/mgr/PyModuleRegistry.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,8 @@ bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_)
151151
return false;
152152
} else {
153153
bool modules_changed = mgr_map_.modules != mgr_map.modules ||
154-
mgr_map_.always_on_modules != mgr_map.always_on_modules;
154+
mgr_map_.always_on_modules != mgr_map.always_on_modules ||
155+
mgr_map_.force_disabled_modules != mgr_map.force_disabled_modules;
155156
mgr_map = mgr_map_;
156157

157158
if (standby_modules != nullptr) {
@@ -240,10 +241,20 @@ void PyModuleRegistry::active_start(
240241
// Anything we're skipping because of !can_run will be flagged
241242
// to the user separately via get_health_checks
242243
if (!(i.second->is_enabled() && i.second->is_loaded())) {
244+
dout(8) << __func__ << " Not starting module '" << i.first << "', it is "
245+
<< "not enabled and loaded" << dendl;
243246
continue;
244247
}
245248

246-
dout(4) << "Starting " << i.first << dendl;
249+
// These are always-on modules but user force-disabled them.
250+
if (mgr_map.force_disabled_modules.find(i.first) !=
251+
mgr_map.force_disabled_modules.end()) {
252+
dout(8) << __func__ << " Not starting module '" << i.first << "', it is "
253+
<< "force-disabled" << dendl;
254+
continue;
255+
}
256+
257+
dout(4) << "Starting module '" << i.first << "'" << dendl;
247258
active_modules->start_one(i.second);
248259
}
249260
}

src/mon/MgrMap.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,9 @@ class MgrMap
297297
// active version.
298298
std::map<uint32_t, std::set<std::string>> always_on_modules;
299299

300+
// Modules which are always-on but have been force-disabled by user.
301+
std::set<std::string> force_disabled_modules;
302+
300303
// Modules which are reported to exist
301304
std::vector<ModuleInfo> available_modules;
302305

@@ -448,7 +451,7 @@ class MgrMap
448451
ENCODE_FINISH(bl);
449452
return;
450453
}
451-
ENCODE_START(13, 6, bl);
454+
ENCODE_START(14, 6, bl);
452455
encode(epoch, bl);
453456
encode(active_addrs, bl, features);
454457
encode(active_gid, bl);
@@ -473,13 +476,14 @@ class MgrMap
473476
encode(clients_addrs, bl, features);
474477
encode(clients_names, bl, features);
475478
encode(flags, bl);
479+
encode(force_disabled_modules, bl);
476480
ENCODE_FINISH(bl);
477481
return;
478482
}
479483

480484
void decode(ceph::buffer::list::const_iterator& p)
481485
{
482-
DECODE_START(13, p);
486+
DECODE_START(14, p);
483487
decode(epoch, p);
484488
decode(active_addrs, p);
485489
decode(active_gid, p);
@@ -549,6 +553,11 @@ class MgrMap
549553
if (struct_v >= 13) {
550554
decode(flags, p);
551555
}
556+
557+
if (struct_v >= 14) {
558+
decode(force_disabled_modules, p);
559+
}
560+
552561
DECODE_FINISH(p);
553562
}
554563

@@ -603,6 +612,13 @@ class MgrMap
603612
f->close_section();
604613
}
605614
f->close_section(); // always_on_modules
615+
616+
f->open_object_section("force_disabled_modules");
617+
for (auto& m : force_disabled_modules) {
618+
f->dump_string("module", m);
619+
}
620+
f->close_section();
621+
606622
f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
607623
f->open_array_section("active_clients");
608624
for (const auto& i : clients) {

0 commit comments

Comments
 (0)