Skip to content

Commit 9ed7f93

Browse files
committed
prometheus: Add Cephadm orch ps output metric to prometheus
Fixes: https://tracker.ceph.com/issues/72496
Signed-off-by: Ankush Behl <[email protected]>
1 parent b99a246 commit 9ed7f93

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

src/pybind/mgr/prometheus/module.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ def health_status_to_number(status: str) -> int:
115115
SMB_METADATA = ('smb_version', 'volume',
116116
'subvolume_group', 'subvolume', 'netbiosname', 'share')
117117

118+
CEPHADM_DAEMON_STATUS = ('service_type', 'daemon_name', 'hostname', 'service_name')
119+
118120
alert_metric = namedtuple('alert_metric', 'name description')
119121
HEALTH_CHECKS = [
120122
alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
@@ -803,6 +805,12 @@ def _setup_static_metrics(self) -> Dict[str, Metric]:
803805
'SMB Metadata',
804806
SMB_METADATA
805807
)
808+
metrics['cephadm_daemon_status'] = Metric(
809+
'gauge',
810+
'cephadm_daemon_status',
811+
'Status of cephadm daemons (0=stopped, 1=running, 2=errored)',
812+
CEPHADM_DAEMON_STATUS
813+
)
806814

807815
for flag in OSD_FLAGS:
808816
path = 'osd_flag_{}'.format(flag)
@@ -993,6 +1001,30 @@ def get_pool_stats(self) -> None:
9931001
(pool['pool_id'],)
9941002
)
9951003

1004+
@profile_method()
def set_cephadm_daemon_status_metrics(self) -> None:
    """Publish one ``cephadm_daemon_status`` sample per listed daemon.

    The daemons come from ``self.list_daemons()``. Each sample is labelled
    with ``(service_type, daemon_name, hostname, service_name)`` and its
    value is ``int(daemon.status)``.

    Collection is best-effort and never raises:

    * if the daemons cannot be listed, the failure is logged and no sample
      is set;
    * if a single daemon cannot be converted (for example its status is
      not convertible with ``int()``), only that daemon is skipped, so one
      bad entry no longer hides every daemon that follows it.
    """
    try:
        daemons = raise_if_exception(self.list_daemons())
        metric = self.metrics['cephadm_daemon_status']
    except Exception as e:
        self.log.error(f"Failed to collect cephadm daemon status: {e}")
        return

    # NOTE(review): the help text registered for this metric documents
    # 0=stopped, 1=running, 2=errored; confirm that this matches the
    # numeric values of the orchestrator's daemon status type.
    for daemon in daemons:
        try:
            # int() raises on None or '' (the getattr default); that is
            # handled per daemon below instead of aborting the whole loop.
            value = int(getattr(daemon, 'status', ''))

            # service_name may be exposed either as a method or as a plain
            # attribute, so both shapes are accepted.
            service_name_attr = getattr(daemon, 'service_name', '')
            if callable(service_name_attr):
                service_name = service_name_attr()
            else:
                service_name = str(service_name_attr)

            # NOTE(review): confirm the daemon objects really expose a
            # `daemon_name` attribute; if not, this label is always ''.
            labels = (
                getattr(daemon, 'daemon_type', ''),
                getattr(daemon, 'daemon_name', ''),
                str(getattr(daemon, 'hostname', '')),
                service_name,
            )
            metric.set(value, labels)
        except Exception as e:
            # Skip only this daemon; debug level keeps a persistently
            # status-less daemon from flooding the log on every scrape.
            self.log.debug(
                f"Skipping cephadm daemon status for {daemon!r}: {e}")
            continue
1027+
9961028
@profile_method()
9971029
def get_df(self) -> None:
9981030
# maybe get the to-be-exported metrics from a config?
@@ -1840,6 +1872,7 @@ def collect(self) -> str:
18401872
self.get_num_objects()
18411873
self.get_all_daemon_health_metrics()
18421874
self.get_smb_metadata()
1875+
self.set_cephadm_daemon_status_metrics()
18431876

18441877
if not self.get_module_option('exclude_perf_counters'):
18451878
self.get_perf_counters()

0 commit comments

Comments
 (0)