1111from . import corosync
1212from . import xmlutil
1313from . import watchdog
14+ from . import cibquery
1415from .service_manager import ServiceManager
1516from .sh import ShellUtils
1617
@@ -176,6 +177,15 @@ def handle_input_sbd_devices(dev_list, dev_list_from_config=None):
176177
177178 return overwrite_list , no_overwrite_list
178179
@staticmethod
def diskbased_sbd_configured() -> bool:
    '''
    Tell whether an SBD device is present in the SBD configuration

    Returns True when SBDUtils.get_sbd_device_from_config() yields a
    truthy (non-empty) result, False otherwise.
    '''
    configured_devices = SBDUtils.get_sbd_device_from_config()
    return True if configured_devices else False
183+
@staticmethod
def diskless_sbd_configured() -> bool:
    '''
    Tell whether the stonith-watchdog-timeout cluster property is set
    to a positive duration

    Returns a real bool in every case: False when the property lookup
    yields nothing (None or an empty string), otherwise whether the
    value converted by utils.crm_msec() is greater than zero.
    '''
    value = utils.get_property("stonith-watchdog-timeout")
    # Guard first: "value and ..." would leak None/"" to callers even
    # though the signature promises a bool
    if not value:
        return False
    return utils.crm_msec(value) > 0
188+
179189
180190class SBDTimeout (object ):
181191 '''
@@ -448,11 +458,10 @@ class CheckResult(Enum):
448458
449459class SBDTimeoutChecker (SBDTimeout ):
450460
451- def __init__ (self , quiet = False , fix = False , from_bootstrap = False ):
461+ def __init__ (self , quiet = False , fix = False ):
452462 super ().__init__ ()
453463 self .quiet = quiet
454464 self .fix = fix
455- self .from_bootstrap = from_bootstrap
456465 self .peer_node_list = []
457466 self .service_disabled_node_list = []
458467
@@ -465,6 +474,22 @@ def _return_helper(check_res_list: list[CheckResult]) -> CheckResult:
465474 else :
466475 return CheckResult .WARNING
467476
@staticmethod
def log_and_return(check_res: CheckResult, fix_flag: bool = False) -> bool:
    '''
    Log the overall outcome of the SBD timeout check and map it to a bool

    check_res: aggregated result of the checks
    fix_flag: when False, a hint about the fix command is logged for
              any result other than CheckResult.SUCCESS

    Returns False only for CheckResult.ERROR; any other result is
    reported as OK and returns True.
    '''
    if check_res == CheckResult.SUCCESS:
        logger.info('SBD: Check sbd timeout configuration: OK.')
        return True

    is_error = check_res == CheckResult.ERROR
    if not fix_flag:
        cmd = "crm cluster health sbd --fix"
        issue_type = "error" if is_error else "warning"
        # Lazy %-style arguments: the message is only built if emitted
        logger.info('Please run "%s" to fix the above %s on the running cluster', cmd, issue_type)

    if is_error:
        logger.error("SBD: Check sbd timeout configuration: FAIL.")
        return False

    # Anything that is neither SUCCESS nor ERROR is a warning, which does
    # not fail the check; returning explicitly here avoids an implicit None
    logger.info('SBD: Check sbd timeout configuration: OK.')
    return True
492+
468493 def check_and_fix (self ) -> CheckResult :
469494 checks_and_fixes = [
470495 # issue name, check method, fix method, SSH required, prerequisites checks
@@ -549,8 +574,11 @@ def check_and_fix(self) -> CheckResult:
549574 )
550575 ]
551576
552- if not self .from_bootstrap and not ServiceManager ().service_is_active (constants .SBD_SERVICE ):
553- raise FixAborted ("%s is not active, skip SBD timeout checks" % constants .SBD_SERVICE )
577+ if not ServiceManager ().service_is_active (constants .SBD_SERVICE ):
578+ if self .fix :
579+ raise FixAborted ("%s is not active, skip fixing SBD timeout issues" % constants .SBD_SERVICE )
580+ elif not SBDUtils .diskbased_sbd_configured () and not SBDUtils .diskless_sbd_configured ():
581+ raise FixAborted ("Neither disk-based nor disk-less SBD is configured, skip checking SBD timeout issues" )
554582
555583 all_nodes_reachable = True
556584 self .peer_node_list = utils .list_cluster_nodes_except_me ()
@@ -592,9 +620,6 @@ def check_and_fix(self) -> CheckResult:
592620
593621 def _check_config_consistency (self , error_msg : str = "" ) -> bool :
594622 consistent = True
595- # Don't check consistency during bootstrap process
596- if self .from_bootstrap :
597- return consistent
598623
599624 if not self .peer_node_list :
600625 if error_msg :
@@ -636,7 +661,7 @@ def _check_sbd_disk_metadata(self) -> CheckResult:
636661 '''
637662 For disk-based SBD, check if the sbd msgwait and watchdog timeout are below expected values
638663 '''
639- if self .disk_based and not self . from_bootstrap :
664+ if self .disk_based :
640665 self .sbd_watchdog_timeout_expected , self .sbd_msgwait_expected = SBDTimeout .get_sbd_metadata_expected ()
641666 if self .sbd_watchdog_timeout < self .sbd_watchdog_timeout_expected :
642667 if not self .quiet :
@@ -661,7 +686,7 @@ def _check_sbd_watchdog_timeout(self) -> CheckResult:
661686 '''
662687 For diskless SBD, check if SBD_WATCHDOG_TIMEOUT is below expected value
663688 '''
664- if not self .disk_based and not self . from_bootstrap :
689+ if not self .disk_based :
665690 self .sbd_watchdog_timeout_expected = SBDTimeout .get_sbd_watchdog_timeout_expected (diskless = True )
666691 if self .sbd_watchdog_timeout < self .sbd_watchdog_timeout_expected :
667692 if not self .quiet :
@@ -731,22 +756,28 @@ def _fix_sbd_systemd_start_timeout(self):
731756 utils .cluster_run_cmd ("systemctl daemon-reload" )
732757
733758 def _check_stonith_watchdog_timeout (self ) -> CheckResult :
734- value = utils .get_property ("stonith-watchdog-timeout" , get_default = False )
759+ value = utils .get_property ("stonith-watchdog-timeout" )
760+ value = int (utils .crm_msec (value )/ 1000 )
735761 if self .disk_based :
736- if value :
762+ if value > 0 :
737763 if not self .quiet :
738764 logger .warning ("It's recommended that stonith-watchdog-timeout is not set when using disk-based SBD" )
739765 return CheckResult .WARNING
740766 else :
741- if not value or int (value ) < self .stonith_watchdog_timeout :
767+ if value == 0 :
768+ if not self .quiet :
769+ logger .error ("It's recommended that stonith-watchdog-timeout is set to %d, now is not set" ,
770+ self .stonith_watchdog_timeout )
771+ return CheckResult .ERROR
772+ if value < self .stonith_watchdog_timeout :
742773 if not self .quiet :
743- logger .error ("It's recommended that stonith-watchdog-timeout is set to %d, now is %s " ,
744- self .stonith_watchdog_timeout , value if value else "not set" )
774+ logger .error ("It's recommended that stonith-watchdog-timeout is set to %d, now is %d " ,
775+ self .stonith_watchdog_timeout , value )
745776 return CheckResult .ERROR
746- elif int ( value ) > self .stonith_watchdog_timeout :
777+ elif value > self .stonith_watchdog_timeout :
747778 if not self .quiet :
748779 logger .warning ("It's recommended that stonith-watchdog-timeout is set to %d, now is %d" ,
749- self .stonith_watchdog_timeout , int ( value ) )
780+ self .stonith_watchdog_timeout , value )
750781 return CheckResult .WARNING
751782 return CheckResult .SUCCESS
752783
@@ -760,16 +791,18 @@ def _fix_stonith_watchdog_timeout(self):
760791
761792 def _check_stonith_timeout (self ) -> CheckResult :
762793 expected_value = self .get_stonith_timeout_expected ()
763- value = utils .get_property ("stonith-timeout" , get_default = False )
764- if not value or int (value ) < expected_value :
794+ value = utils .get_property ("stonith-timeout" )
795+ # will get default value from pacemaker metadata if not set
796+ value = int (utils .crm_msec (value )/ 1000 )
797+ if value < expected_value :
765798 if not self .quiet :
766- logger .error ("It's recommended that stonith-timeout is set to %d, now is %s " ,
767- expected_value , value if value else "not set" )
799+ logger .error ("It's recommended that stonith-timeout is set to %d, now is %d " ,
800+ expected_value , value )
768801 return CheckResult .ERROR
769- elif int ( value ) > expected_value :
802+ elif value > expected_value :
770803 if not self .quiet :
771804 logger .warning ("It's recommended that stonith-timeout is set to %d, now is %d" ,
772- expected_value , int ( value ) )
805+ expected_value , value )
773806 return CheckResult .WARNING
774807 return CheckResult .SUCCESS
775808
@@ -822,14 +855,24 @@ def _fix_sbd_service_is_enabled(self):
822855 service_manager .enable_service (constants .SBD_SERVICE , node )
823856
824857 def _check_fence_sbd (self ) -> CheckResult :
825- if not self .disk_based or self . from_bootstrap :
858+ if not self .disk_based :
826859 return CheckResult .SUCCESS
827860 xml_inst = xmlutil .CrmMonXmlParser ()
861+ if xml_inst .not_connected ():
862+ cib = xmlutil .text2elem (sh .cluster_shell ().get_stdout_or_raise_error ("crm configure show xml" ))
863+ ra = cibquery .ResourceAgent ("stonith" , "" , "fence_sbd" )
864+ configured = cibquery .get_primitives_with_ra (cib , ra )
865+ if configured :
866+ return CheckResult .SUCCESS
867+ else :
868+ if not self .quiet :
869+ logger .error ("Fence agent %s is not configured" , SBDManager .SBD_RA )
870+ return CheckResult .ERROR
828871 if not xml_inst .is_resource_configured (SBDManager .SBD_RA ):
829872 if not self .quiet :
830873 logger .error ("Fence agent %s is not configured" , SBDManager .SBD_RA )
831874 return CheckResult .ERROR
832- elif not xml_inst .is_resource_started (SBDManager .SBD_RA ):
875+ elif not xml_inst .is_resource_started (SBDManager .SBD_RA ) and not utils . is_cluster_in_maintenance_mode () :
833876 if not self .quiet :
834877 logger .error ("Fence agent %s is not started" , SBDManager .SBD_RA )
835878 return CheckResult .ERROR
@@ -1130,12 +1173,12 @@ def init_and_deploy_sbd(self, restart_first=False):
11301173 # Only then should additional properties be configured,
11311174 # because the stonith-watchdog-timeout property requires sbd.service to be active.
11321175 restart_cluster_first = restart_first or \
1133- (self .diskless_sbd and not ServiceManager ().service_is_active (constants .SBD_SERVICE ))
1176+ not self .diskless_sbd or \
1177+ not ServiceManager ().service_is_active (constants .SBD_SERVICE )
11341178 if restart_cluster_first :
11351179 bootstrap .restart_cluster ()
11361180
1137- self .configure_sbd ()
1138- bootstrap .adjust_properties (with_sbd = True )
1181+ bootstrap .adjust_properties ()
11391182
11401183 # In other cases, it is better to restart the cluster
11411184 # after modifying SBD-related configurations.
0 commit comments