Skip to content

Commit 43d8c21

Browse files
committed
Dev: sbd: Refactor to enable checking when cluster is down
1 parent cb7ee9d commit 43d8c21

File tree

6 files changed

+106
-58
lines changed

6 files changed

+106
-58
lines changed

crmsh/bootstrap.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,14 +2068,9 @@ def adjust_priority_fencing_delay(is_2node_wo_qdevice):
20682068
and the current cluster is 2 nodes without qdevice,
20692069
set priority-fencing-delay=2*pcmk_delay_max
20702070
"""
2071-
out = sh.cluster_shell().get_stdout_or_raise_error("crm configure show related:stonith")
2072-
if not out:
2073-
return
2074-
pcmk_delay_max_v_list = re.findall(r"pcmk_delay_max=(\w+)", out)
2075-
if pcmk_delay_max_v_list:
2076-
max_value = max([int(utils.crm_msec(v)/1000) for v in pcmk_delay_max_v_list])
2077-
if pcmk_delay_max_v_list and is_2node_wo_qdevice:
2078-
utils.set_property("priority-fencing-delay", 2*max_value, conditional=True)
2071+
pcmk_delay_max_value = utils.get_pcmk_delay_max_configured_value()
2072+
if pcmk_delay_max_value > 0 and is_2node_wo_qdevice:
2073+
utils.set_property("priority-fencing-delay", 2*pcmk_delay_max_value, conditional=True)
20792074
else:
20802075
utils.set_property("priority-fencing-delay", 0)
20812076

@@ -2174,10 +2169,9 @@ def remove_node_from_cluster(node, dead_node=False):
21742169
corosync.del_node(node_ip if node_ip is not None else node)
21752170

21762171
corosync.configure_two_node(removing=True)
2177-
adjust_properties()
2178-
21792172
logger.info("Propagating configuration changes across the remaining nodes")
21802173
sync_path(corosync.conf())
2174+
adjust_properties()
21812175

21822176
sh.cluster_shell().get_stdout_or_raise_error("corosync-cfgtool -R")
21832177

@@ -2761,19 +2755,19 @@ def adjust_pcmk_delay_max(is_2node_wo_qdevice):
27612755
logger.info("Delete parameter 'pcmk_delay_max' for resource '{}'".format(res))
27622756

27632757

2764-
def adjust_stonith_timeout(with_sbd: bool = False):
2758+
def adjust_stonith_timeout():
27652759
"""
27662760
Adjust stonith-timeout for sbd and other scenarios
27672761
"""
2768-
if ServiceManager().service_is_active(constants.SBD_SERVICE) or with_sbd:
2769-
sbd.SBDTimeoutChecker(quiet=True, fix=True, from_bootstrap=True).check_and_fix()
2762+
if ServiceManager().service_is_active(constants.SBD_SERVICE):
2763+
sbd.SBDTimeoutChecker(quiet=True, fix=True).check_and_fix()
27702764
else:
27712765
value = get_stonith_timeout_generally_expected()
27722766
if value:
27732767
utils.set_property("stonith-timeout", value, conditional=True)
27742768

27752769

2776-
def adjust_properties(with_sbd: bool = False):
2770+
def adjust_properties():
27772771
"""
27782772
Adjust properties for the cluster:
27792773
- pcmk_delay_max
@@ -2791,7 +2785,7 @@ def adjust_properties(with_sbd: bool = False):
27912785
return
27922786
is_2node_wo_qdevice = utils.is_2node_cluster_without_qdevice()
27932787
adjust_pcmk_delay_max(is_2node_wo_qdevice)
2794-
adjust_stonith_timeout(with_sbd=with_sbd)
2788+
adjust_stonith_timeout()
27952789
adjust_priority_in_rsc_defaults(is_2node_wo_qdevice)
27962790
adjust_priority_fencing_delay(is_2node_wo_qdevice)
27972791

crmsh/sbd.py

Lines changed: 70 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from . import corosync
1212
from . import xmlutil
1313
from . import watchdog
14+
from . import cibquery
1415
from .service_manager import ServiceManager
1516
from .sh import ShellUtils
1617

@@ -176,6 +177,15 @@ def handle_input_sbd_devices(dev_list, dev_list_from_config=None):
176177

177178
return overwrite_list, no_overwrite_list
178179

180+
@staticmethod
def diskbased_sbd_configured() -> bool:
    '''
    Tell whether disk-based SBD is configured, i.e. whether
    SBDUtils.get_sbd_device_from_config() returns a non-empty value
    '''
    configured_devices = SBDUtils.get_sbd_device_from_config()
    return True if configured_devices else False
183+
184+
@staticmethod
def diskless_sbd_configured() -> bool:
    '''
    Tell whether disk-less SBD is configured, i.e. whether the
    stonith-watchdog-timeout property holds a value that
    utils.crm_msec() converts to something greater than zero

    Always returns a real bool, also when the property has no value
    '''
    value = utils.get_property("stonith-watchdog-timeout")
    # A bare "value and ..." expression would hand back the falsy operand
    # itself (None or "") rather than False, breaking the annotated type
    if not value:
        return False
    return utils.crm_msec(value) > 0
188+
179189

180190
class SBDTimeout(object):
181191
'''
@@ -448,11 +458,10 @@ class CheckResult(Enum):
448458

449459
class SBDTimeoutChecker(SBDTimeout):
450460

451-
def __init__(self, quiet=False, fix=False, from_bootstrap=False):
461+
def __init__(self, quiet=False, fix=False):
452462
super().__init__()
453463
self.quiet = quiet
454464
self.fix = fix
455-
self.from_bootstrap = from_bootstrap
456465
self.peer_node_list = []
457466
self.service_disabled_node_list = []
458467

@@ -465,6 +474,22 @@ def _return_helper(check_res_list: list[CheckResult]) -> CheckResult:
465474
else:
466475
return CheckResult.WARNING
467476

477+
@staticmethod
def log_and_return(check_res: CheckResult, fix_flag: bool = False) -> bool:
    '''
    Log the final verdict of the SBD timeout check and map it to a bool

    check_res: overall result of the checks
    fix_flag: when True, the hint about running the --fix command is
              not logged

    Return False only when check_res is CheckResult.ERROR, True otherwise
    '''
    if check_res == CheckResult.SUCCESS:
        logger.info('SBD: Check sbd timeout configuration: OK.')
        return True

    failed = check_res == CheckResult.ERROR
    if not fix_flag:
        # Only build the hint when it is actually going to be logged
        cmd = "crm cluster health sbd --fix"
        issue_type = "error" if failed else "warning"
        logger.info(f'Please run "{cmd}" to fix the above {issue_type} on the running cluster')

    if failed:
        logger.error("SBD: Check sbd timeout configuration: FAIL.")
        return False

    # Anything that is not an error (i.e. a warning) still passes; ending on
    # an unconditional return guarantees a bool on every path instead of
    # falling off the end of the function with None
    logger.info('SBD: Check sbd timeout configuration: OK.')
    return True
492+
468493
def check_and_fix(self) -> CheckResult:
469494
checks_and_fixes = [
470495
# issue name, check method, fix method, SSH required, prerequisites checks
@@ -549,8 +574,11 @@ def check_and_fix(self) -> CheckResult:
549574
)
550575
]
551576

552-
if not self.from_bootstrap and not ServiceManager().service_is_active(constants.SBD_SERVICE):
553-
raise FixAborted("%s is not active, skip SBD timeout checks" % constants.SBD_SERVICE)
577+
if not ServiceManager().service_is_active(constants.SBD_SERVICE):
578+
if self.fix:
579+
raise FixAborted("%s is not active, skip fixing SBD timeout issues" % constants.SBD_SERVICE)
580+
elif not SBDUtils.diskbased_sbd_configured() and not SBDUtils.diskless_sbd_configured():
581+
raise FixAborted("Neither disk-based nor disk-less SBD is configured, skip checking SBD timeout issues")
554582

555583
all_nodes_reachable = True
556584
self.peer_node_list = utils.list_cluster_nodes_except_me()
@@ -592,9 +620,6 @@ def check_and_fix(self) -> CheckResult:
592620

593621
def _check_config_consistency(self, error_msg: str = "") -> bool:
594622
consistent = True
595-
# Don't check consistency during bootstrap process
596-
if self.from_bootstrap:
597-
return consistent
598623

599624
if not self.peer_node_list:
600625
if error_msg:
@@ -636,7 +661,7 @@ def _check_sbd_disk_metadata(self) -> CheckResult:
636661
'''
637662
For disk-based SBD, check if the sbd msgwait and watchdog timeout are below expected values
638663
'''
639-
if self.disk_based and not self.from_bootstrap:
664+
if self.disk_based:
640665
self.sbd_watchdog_timeout_expected, self.sbd_msgwait_expected = SBDTimeout.get_sbd_metadata_expected()
641666
if self.sbd_watchdog_timeout < self.sbd_watchdog_timeout_expected:
642667
if not self.quiet:
@@ -661,7 +686,7 @@ def _check_sbd_watchdog_timeout(self) -> CheckResult:
661686
'''
662687
For diskless SBD, check if SBD_WATCHDOG_TIMEOUT is below expected value
663688
'''
664-
if not self.disk_based and not self.from_bootstrap:
689+
if not self.disk_based:
665690
self.sbd_watchdog_timeout_expected = SBDTimeout.get_sbd_watchdog_timeout_expected(diskless=True)
666691
if self.sbd_watchdog_timeout < self.sbd_watchdog_timeout_expected:
667692
if not self.quiet:
@@ -731,22 +756,28 @@ def _fix_sbd_systemd_start_timeout(self):
731756
utils.cluster_run_cmd("systemctl daemon-reload")
732757

733758
def _check_stonith_watchdog_timeout(self) -> CheckResult:
734-
value = utils.get_property("stonith-watchdog-timeout", get_default=False)
759+
value = utils.get_property("stonith-watchdog-timeout")
760+
value = int(utils.crm_msec(value)/1000)
735761
if self.disk_based:
736-
if value:
762+
if value > 0:
737763
if not self.quiet:
738764
logger.warning("It's recommended that stonith-watchdog-timeout is not set when using disk-based SBD")
739765
return CheckResult.WARNING
740766
else:
741-
if not value or int(value) < self.stonith_watchdog_timeout:
767+
if value == 0:
768+
if not self.quiet:
769+
logger.error("It's recommended that stonith-watchdog-timeout is set to %d, now is not set",
770+
self.stonith_watchdog_timeout)
771+
return CheckResult.ERROR
772+
if value < self.stonith_watchdog_timeout:
742773
if not self.quiet:
743-
logger.error("It's recommended that stonith-watchdog-timeout is set to %d, now is %s",
744-
self.stonith_watchdog_timeout, value if value else "not set")
774+
logger.error("It's recommended that stonith-watchdog-timeout is set to %d, now is %d",
775+
self.stonith_watchdog_timeout, value)
745776
return CheckResult.ERROR
746-
elif int(value) > self.stonith_watchdog_timeout:
777+
elif value > self.stonith_watchdog_timeout:
747778
if not self.quiet:
748779
logger.warning("It's recommended that stonith-watchdog-timeout is set to %d, now is %d",
749-
self.stonith_watchdog_timeout, int(value))
780+
self.stonith_watchdog_timeout, value)
750781
return CheckResult.WARNING
751782
return CheckResult.SUCCESS
752783

@@ -760,16 +791,18 @@ def _fix_stonith_watchdog_timeout(self):
760791

761792
def _check_stonith_timeout(self) -> CheckResult:
762793
expected_value = self.get_stonith_timeout_expected()
763-
value = utils.get_property("stonith-timeout", get_default=False)
764-
if not value or int(value) < expected_value:
794+
value = utils.get_property("stonith-timeout")
795+
# will get default value from pacemaker metadata if not set
796+
value = int(utils.crm_msec(value)/1000)
797+
if value < expected_value:
765798
if not self.quiet:
766-
logger.error("It's recommended that stonith-timeout is set to %d, now is %s",
767-
expected_value, value if value else "not set")
799+
logger.error("It's recommended that stonith-timeout is set to %d, now is %d",
800+
expected_value, value)
768801
return CheckResult.ERROR
769-
elif int(value) > expected_value:
802+
elif value > expected_value:
770803
if not self.quiet:
771804
logger.warning("It's recommended that stonith-timeout is set to %d, now is %d",
772-
expected_value, int(value))
805+
expected_value, value)
773806
return CheckResult.WARNING
774807
return CheckResult.SUCCESS
775808

@@ -822,14 +855,24 @@ def _fix_sbd_service_is_enabled(self):
822855
service_manager.enable_service(constants.SBD_SERVICE, node)
823856

824857
def _check_fence_sbd(self) -> CheckResult:
825-
if not self.disk_based or self.from_bootstrap:
858+
if not self.disk_based:
826859
return CheckResult.SUCCESS
827860
xml_inst = xmlutil.CrmMonXmlParser()
861+
if xml_inst.not_connected():
862+
cib = xmlutil.text2elem(sh.cluster_shell().get_stdout_or_raise_error("crm configure show xml"))
863+
ra = cibquery.ResourceAgent("stonith", "", "fence_sbd")
864+
configured = cibquery.get_primitives_with_ra(cib, ra)
865+
if configured:
866+
return CheckResult.SUCCESS
867+
else:
868+
if not self.quiet:
869+
logger.error("Fence agent %s is not configured", SBDManager.SBD_RA)
870+
return CheckResult.ERROR
828871
if not xml_inst.is_resource_configured(SBDManager.SBD_RA):
829872
if not self.quiet:
830873
logger.error("Fence agent %s is not configured", SBDManager.SBD_RA)
831874
return CheckResult.ERROR
832-
elif not xml_inst.is_resource_started(SBDManager.SBD_RA):
875+
elif not xml_inst.is_resource_started(SBDManager.SBD_RA) and not utils.is_cluster_in_maintenance_mode():
833876
if not self.quiet:
834877
logger.error("Fence agent %s is not started", SBDManager.SBD_RA)
835878
return CheckResult.ERROR
@@ -1130,12 +1173,12 @@ def init_and_deploy_sbd(self, restart_first=False):
11301173
# Only then should additional properties be configured,
11311174
# because the stonith-watchdog-timeout property requires sbd.service to be active.
11321175
restart_cluster_first = restart_first or \
1133-
(self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE))
1176+
not self.diskless_sbd or \
1177+
not ServiceManager().service_is_active(constants.SBD_SERVICE)
11341178
if restart_cluster_first:
11351179
bootstrap.restart_cluster()
11361180

1137-
self.configure_sbd()
1138-
bootstrap.adjust_properties(with_sbd=True)
1181+
bootstrap.adjust_properties()
11391182

11401183
# In other cases, it is better to restart the cluster
11411184
# after modifying SBD-related configurations.

crmsh/ui_cluster.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,7 @@ def do_health(self, context, *args):
839839
logger.error("hawk2: passwordless ssh authentication: FAIL.")
840840
logger.warning('Please run "crm cluster health hawk2 --fix"')
841841
return False
842+
842843
case 'sbd':
843844
fix = parsed_args.fix
844845
try:
@@ -850,16 +851,8 @@ def do_health(self, context, *args):
850851
logger.error('%s', e)
851852
logger.error('SBD: Check sbd timeout configuration: FAIL.')
852853
return False
853-
if result == sbd.CheckResult.ERROR:
854-
if not fix:
855-
logger.info('Please run "crm cluster health sbd --fix" to fix the above error')
856-
logger.error('SBD: Check sbd timeout configuration: FAIL.')
857-
return False
858-
else:
859-
if result == sbd.CheckResult.WARNING and not fix:
860-
logger.info('Please run "crm cluster health sbd --fix" to fix the above warning')
861-
logger.info('SBD: Check sbd timeout configuration: OK.')
862-
return True
854+
return sbd.SBDTimeoutChecker.log_and_return(result, fix)
855+
863856
case 'sles16':
864857
try:
865858
if parsed_args.fix:

crmsh/ui_sbd.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ def _configure_show(self, args) -> None:
252252
print()
253253
self._show_property()
254254

255+
print()
255256
return SBD.check_timeout_configurations()
256257

257258
@staticmethod
@@ -262,11 +263,7 @@ def check_timeout_configurations() -> bool:
262263
except sbd.FixAborted as e:
263264
logger.error('%s', e)
264265
return False
265-
if check_rc != sbd.CheckResult.SUCCESS:
266-
issue_type = "error" if check_rc == sbd.CheckResult.ERROR else "warning"
267-
logger.info('Please run "crm cluster health sbd --fix" to fix the above %s', issue_type)
268-
269-
return check_rc in (sbd.CheckResult.SUCCESS, sbd.CheckResult.WARNING)
266+
return sbd.SBDTimeoutChecker.log_and_return(check_rc)
270267

271268
def _parse_args(self, args: tuple[str, ...]) -> dict[str, int|str]:
272269
'''

crmsh/utils.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2822,13 +2822,28 @@ def is_2node_cluster_without_qdevice():
28222822
return (current_num + qdevice_num) == 2
28232823

28242824

2825+
def get_pcmk_delay_max_configured_value() -> int:
    """
    Get the largest pcmk_delay_max value found in the output of
    "crm configure show related:stonith"

    Each matched value is passed through crm_msec() and divided by 1000
    before comparison. Returns 0 when the command prints nothing or when
    no pcmk_delay_max parameter is present
    """
    stonith_config = sh.cluster_shell().get_stdout_or_raise_error("crm configure show related:stonith")
    if not stonith_config:
        return 0
    raw_values = re.findall(r"pcmk_delay_max=(\w+)", stonith_config)
    # default=0 covers the "no match" case without a separate branch
    return max((int(crm_msec(raw)/1000) for raw in raw_values), default=0)
2834+
2835+
28252836
def get_pcmk_delay_max(two_node_without_qdevice=False):
28262837
"""
28272838
Get value of pcmk_delay_max
28282839
"""
2840+
configured_value = get_pcmk_delay_max_configured_value()
2841+
if configured_value > 0:
2842+
return configured_value
28292843
if ServiceManager().service_is_active("pacemaker.service") and two_node_without_qdevice:
28302844
return constants.PCMK_DELAY_MAX
2831-
return 0
2845+
else:
2846+
return 0
28322847

28332848

28342849
def get_property(name, property_type="crm_config", peer=None, get_default=True):

crmsh/xmlutil.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,6 +1516,8 @@ class CrmMonXmlParser(object):
15161516
"""
15171517
Class to parse xml output of crm_mon
15181518
"""
1519+
NOT_CONNECTED_CODE = 102
1520+
15191521
def __init__(self, peer=None):
15201522
"""
15211523
Init function
@@ -1531,6 +1533,10 @@ def _load(self):
15311533
_, output, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(self.peer, constants.CRM_MON_XML_OUTPUT)
15321534
return text2elem(output) if output else None
15331535

1536+
def not_connected(self):
    """
    Check whether the loaded XML contains a <status> element whose
    code attribute equals NOT_CONNECTED_CODE
    """
    # NOTE(review): presumably this code is what crm_mon reports when it
    # cannot reach the cluster -- confirm against Pacemaker's exit codes
    status_query = "//status[@code='{}']".format(self.NOT_CONNECTED_CODE)
    matches = self.xml_elem.xpath(status_query)
    return bool(matches)
1539+
15341540
def with_quorum(self):
15351541
"""
15361542
Check if cluster is with quorum

0 commit comments

Comments
 (0)