@@ -115,6 +115,8 @@ def health_status_to_number(status: str) -> int:
# Label tuple handed to the 'SMB Metadata' metric definition.
SMB_METADATA = (
    'smb_version',
    'volume',
    'subvolume_group',
    'subvolume',
    'netbiosname',
    'share',
)

# Label tuple handed to the 'cephadm_daemon_status' gauge definition.  The
# order must match the label-value tuple built in
# set_cephadm_daemon_status_metrics().
CEPHADM_DAEMON_STATUS = (
    'service_type',
    'daemon_name',
    'hostname',
    'service_name',
)

# Lightweight record used by HEALTH_CHECKS: a health-check name plus its
# human-readable description.
alert_metric = namedtuple('alert_metric', 'name description')
119121HEALTH_CHECKS = [
120122 alert_metric ('SLOW_OPS' , 'OSD or Monitor requests taking a long time to process' ),
@@ -803,6 +805,12 @@ def _setup_static_metrics(self) -> Dict[str, Metric]:
803805 'SMB Metadata' ,
804806 SMB_METADATA
805807 )
808+ metrics ['cephadm_daemon_status' ] = Metric (
809+ 'gauge' ,
810+ 'cephadm_daemon_status' ,
811+ 'Status of cephadm daemons (0=stopped, 1=running, 2=errored)' ,
812+ CEPHADM_DAEMON_STATUS
813+ )
806814
807815 for flag in OSD_FLAGS :
808816 path = 'osd_flag_{}' .format (flag )
@@ -993,6 +1001,30 @@ def get_pool_stats(self) -> None:
9931001 (pool ['pool_id' ],)
9941002 )
9951003
@profile_method()
def set_cephadm_daemon_status_metrics(self) -> None:
    """Publish one ``cephadm_daemon_status`` sample per orchestrator daemon.

    The daemon list is obtained from ``self.list_daemons()`` (unwrapped with
    ``raise_if_exception``).  For each daemon the gauge is set to
    ``int(daemon.status)`` and labelled with, in order: the daemon's
    ``daemon_type`` (exported under the ``service_type`` label), its
    ``daemon_name``, its ``hostname`` and its ``service_name`` (which may be
    either a plain attribute or a zero-argument callable).

    Collection is best-effort and never raises:

    * a daemon whose ``status`` is missing or cannot be converted to an
      integer is skipped, so it cannot suppress the samples of the daemons
      that follow it;
    * any other failure (e.g. the orchestrator being unavailable) is logged
      and ends the collection for this scrape.

    NOTE(review): the gauge's help text advertises
    ``0=stopped, 1=running, 2=errored``; confirm that the integer form of
    ``daemon.status`` really follows that mapping.
    """
    try:
        daemons = raise_if_exception(self.list_daemons())
        for daemon in daemons:
            daemon_name = getattr(daemon, 'daemon_name', '')
            status = getattr(daemon, 'status', None)

            # A daemon that has not reported a status yet would make int()
            # raise; skip just that daemon instead of aborting the loop.
            try:
                status_value = int(status)
            except (TypeError, ValueError):
                self.log.debug(
                    f"Skipping cephadm daemon {daemon_name!r}: "
                    f"no usable status ({status!r})"
                )
                continue

            service_type = getattr(daemon, 'daemon_type', '')
            hostname = str(getattr(daemon, 'hostname', ''))
            # service_name may be exposed as a method or as a plain value.
            service_name_attr = getattr(daemon, 'service_name', '')
            if callable(service_name_attr):
                service_name = service_name_attr()
            else:
                service_name = str(service_name_attr)

            self.metrics['cephadm_daemon_status'].set(
                status_value,
                (
                    service_type,
                    daemon_name,
                    hostname,
                    service_name,
                ),
            )
    except Exception as e:
        # Best-effort metric: a failing orchestrator must not break the
        # rest of the scrape.
        self.log.error(f"Failed to collect cephadm daemon status: { e } ")
1027+
9961028 @profile_method ()
9971029 def get_df (self ) -> None :
9981030 # maybe get the to-be-exported metrics from a config?
@@ -1840,6 +1872,7 @@ def collect(self) -> str:
18401872 self .get_num_objects ()
18411873 self .get_all_daemon_health_metrics ()
18421874 self .get_smb_metadata ()
1875+ self .set_cephadm_daemon_status_metrics ()
18431876
18441877 if not self .get_module_option ('exclude_perf_counters' ):
18451878 self .get_perf_counters ()
0 commit comments