Skip to content

Commit d1ba8e9

Browse files
committed
mgr/cephadm: adding automation for Prometheus config generation
Signed-off-by: Redouane Kachach <[email protected]>
1 parent 834323c commit d1ba8e9

File tree

5 files changed

+201
-366
lines changed

5 files changed

+201
-366
lines changed

src/pybind/mgr/cephadm/module.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,11 @@ def _init_cert_mgr(self) -> None:
743743

744744
self.cert_mgr.init_tlsobject_store()
745745

746+
def _get_mgr_ips(self) -> List[str]:
    """Return the inventory address of each mgr daemon that has a hostname.

    Daemons are taken from the cache via ``get_daemons_by_service('mgr')``;
    entries whose ``hostname`` is ``None`` are skipped.
    """
    mgr_ips: List[str] = []
    for daemon in self.cache.get_daemons_by_service('mgr'):
        if daemon.hostname is None:
            continue
        mgr_ips.append(self.inventory.get_addr(daemon.hostname))
    return mgr_ips
750+
746751
def _get_security_config(self) -> Tuple[bool, bool, bool]:
747752
oauth2_proxy_enabled = len(self.cache.get_daemons_by_service('oauth2-proxy')) > 0
748753
mgmt_gw_enabled = len(self.cache.get_daemons_by_service('mgmt-gateway')) > 0

src/pybind/mgr/cephadm/services/monitoring.py

Lines changed: 96 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from mgr_module import HandleCommandResult
1111
from .service_registry import register_cephadm_service
12+
from cephadm.services.service_registry import service_registry
1213

1314
from orchestrator import DaemonDescription
1415
from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, \
@@ -24,6 +25,14 @@
2425
logger = logging.getLogger(__name__)
2526

2627

28+
def get_field_from_spec(spec: ServiceSpec, attr: str, default: Any) -> Any:
    """Read ``attr`` from ``spec``, falling back to ``default``.

    The fallback is used both when the attribute does not exist on the spec
    and when it exists but holds a falsy value (``None``, ``''``, ``0``,
    an empty list, ...).
    """
    value = getattr(spec, attr, None)
    if not value:
        return default
    return value
34+
35+
2736
@register_cephadm_service
2837
class GrafanaService(CephadmService):
2938
TYPE = 'grafana'
@@ -484,6 +493,14 @@ class PrometheusService(CephadmService):
484493
USER_CFG_KEY = 'prometheus/web_user'
485494
PASS_CFG_KEY = 'prometheus/web_password'
486495

496+
def prepare_create(
        self,
        daemon_spec: CephadmDaemonDeploySpec,
) -> CephadmDaemonDeploySpec:
    """Attach the generated config and dependency list to ``daemon_spec``.

    The spec's daemon type must match this service's ``TYPE``. The same
    ``daemon_spec`` object is returned after ``final_config`` and ``deps``
    have been set from ``generate_config``.
    """
    assert self.TYPE == daemon_spec.daemon_type
    final_config, deps = self.generate_config(daemon_spec)
    daemon_spec.final_config = final_config
    daemon_spec.deps = deps
    return daemon_spec
503+
487504
def config(self, spec: ServiceSpec) -> None:
488505
# make sure module is enabled
489506
mgr_map = self.mgr.get('mgr_map')
@@ -501,70 +518,71 @@ def get_prometheus_certificates(self, daemon_spec: CephadmDaemonDeploySpec) -> T
501518
cert, key = self.mgr.cert_mgr.generate_cert([host_fqdn, 'prometheus_servers'], node_ip)
502519
return cert, key
503520

504-
def prepare_create(
505-
self,
506-
daemon_spec: CephadmDaemonDeploySpec,
507-
) -> CephadmDaemonDeploySpec:
508-
assert self.TYPE == daemon_spec.daemon_type
509-
daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
510-
return daemon_spec
521+
def get_service_discovery_cfg(self, security_enabled: bool, mgmt_gw_enabled: bool) -> Dict[str, List[str]]:
    """
    Build the service-discovery URLs for the services that require monitoring.

    Args:
        security_enabled: selects ``https`` (True) or ``http`` (False) for the
            per-mgr URLs; not used when the mgmt-gateway is enabled.
        mgmt_gw_enabled: when True, the single mgmt-gateway internal endpoint
            is used as URL prefix instead of one prefix per mgr IP.

    Returns:
        Dict[str, List[str]]: A dictionary where the keys are service names
        (e.g., "nfs", "node-exporter") and the values are the list of
        service-discovery URLs used to get the corresponding service targets.
        "ceph" is always present; any other service is included only if the
        cache reports daemons for it (looked up by service or by type).
    """
    if mgmt_gw_enabled:
        url_prefixes = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}']
    else:
        port = self.mgr.service_discovery_port
        protocol = 'https' if security_enabled else 'http'
        url_prefixes = []
        for ip in self.mgr._get_mgr_ips():
            url_prefixes.append(f'{protocol}://{wrap_ipv6(ip)}:{port}')

    sd_cfg: Dict[str, List[str]] = {}
    for service in service_registry.get_services_requiring_monitoring():
        has_targets = (
            service == 'ceph'
            or bool(self.mgr.cache.get_daemons_by_service(service))
            or bool(self.mgr.cache.get_daemons_by_type(service))
        )
        if not has_targets:
            continue
        urls = []
        for prefix in url_prefixes:
            urls.append(f'{prefix}/sd/prometheus/sd-config?service={service}')
        sd_cfg[service] = urls
    return sd_cfg
543+
544+
def configure_alerts(self, r: Dict) -> None:
    """Add the alerting rule files to ``r['files']`` (modified in place).

    * ``ceph_alerts.yml`` is added only if the alerts file configured in
      ``self.mgr.prometheus_alerts_path`` exists.
    * ``custom_alerts.yml`` is always written from the key value store,
      defaulting to an empty string. Writing it unconditionally means a
      changed or removed store value overwrites (or empties) the file, so
      the `cephadm` binary does not need to be adapted to remove it.

    The template engine is deliberately not used for the custom alerts:
    they are static, and they are themselves templates for the Go template
    engine, whose curly braces would need cumbersome escaping.
    """
    alerts_path = self.mgr.prometheus_alerts_path
    if os.path.exists(alerts_path):
        with open(alerts_path, 'r', encoding='utf-8') as alerts_file:
            bundled_alerts = alerts_file.read()
        r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = bundled_alerts

    custom_alerts = self.mgr.get_store('services/prometheus/alerting/custom_alerts.yml', '')
    r['files']['/etc/prometheus/alerting/custom_alerts.yml'] = custom_alerts
511567

512568
def generate_config(
513569
self,
514570
daemon_spec: CephadmDaemonDeploySpec,
515571
) -> Tuple[Dict[str, Any], List[str]]:
516572

517573
assert self.TYPE == daemon_spec.daemon_type
518-
spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
519-
try:
520-
retention_time = spec.retention_time if spec.retention_time else '15d'
521-
except AttributeError:
522-
retention_time = '15d'
523574

524-
try:
525-
targets = spec.targets
526-
except AttributeError:
527-
logger.warning('Prometheus targets not found in the spec. Using empty list.')
528-
targets = []
529-
530-
try:
531-
retention_size = spec.retention_size if spec.retention_size else '0'
532-
except AttributeError:
533-
# default to disabled
534-
retention_size = '0'
575+
spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
576+
retention_time = get_field_from_spec(spec, 'retention_time', '15d')
577+
retention_size = get_field_from_spec(spec, 'retention_size', '0')
578+
targets = get_field_from_spec(spec, 'targets', [])
535579

536580
# build service discovery end-point
537581
security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config()
538-
port = self.mgr.service_discovery_port
539-
mgr_addr = wrap_ipv6(self.mgr.get_mgr_ip())
540-
541-
protocol = 'https' if security_enabled else 'http'
542-
self.mgr.get_mgmt_gw_internal_endpoint()
543-
if mgmt_gw_enabled:
544-
service_discovery_url_prefix = f'{self.mgr.get_mgmt_gw_internal_endpoint()}'
545-
else:
546-
service_discovery_url_prefix = f'{protocol}://{mgr_addr}:{port}'
547-
srv_end_point = f'{service_discovery_url_prefix}/sd/prometheus/sd-config?'
548-
549-
node_exporter_cnt = len(self.mgr.cache.get_daemons_by_service('node-exporter'))
550-
alertmgr_cnt = len(self.mgr.cache.get_daemons_by_service('alertmanager'))
551-
haproxy_cnt = len(self.mgr.cache.get_daemons_by_type('ingress'))
552-
node_exporter_sd_url = f'{srv_end_point}service=node-exporter' if node_exporter_cnt > 0 else None
553-
alertmanager_sd_url = f'{srv_end_point}service=alertmanager' if alertmgr_cnt > 0 else None
554-
haproxy_sd_url = f'{srv_end_point}service=haproxy' if haproxy_cnt > 0 else None
555-
mgr_prometheus_sd_url = f'{srv_end_point}service=mgr-prometheus' # always included
556-
ceph_exporter_sd_url = f'{srv_end_point}service=ceph-exporter' # always included
557-
nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included
558-
mgmt_gw_enabled = len(self.mgr.cache.get_daemons_by_service('mgmt-gateway')) > 0
559-
nfs_sd_url = f'{srv_end_point}service=nfs' # always included
560-
smb_sd_url = f'{srv_end_point}service=smb' # always included
561-
562582
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
563-
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
564583
federate_path = self.get_target_cluster_federate_path(targets)
565584
cluster_credentials: Dict[str, Any] = {}
566585
cluster_credentials_files: Dict[str, Any] = {'files': {}}
567-
FSID = self.mgr._cluster_fsid
568586
if targets:
569587
if 'dashboard' in self.mgr.get('mgr_map')['modules']:
570588
cluster_credentials_files, cluster_credentials = self.mgr.remote(
@@ -576,21 +594,14 @@ def generate_config(
576594
# generate the prometheus configuration
577595
context = {
578596
'alertmanager_url_prefix': '/alertmanager' if mgmt_gw_enabled else '/',
597+
'security_enabled': security_enabled,
579598
'alertmanager_web_user': alertmanager_user,
580599
'alertmanager_web_password': alertmanager_password,
581-
'security_enabled': security_enabled,
582600
'service_discovery_username': self.mgr.http_server.service_discovery.username,
583601
'service_discovery_password': self.mgr.http_server.service_discovery.password,
584-
'mgr_prometheus_sd_url': mgr_prometheus_sd_url,
585-
'node_exporter_sd_url': node_exporter_sd_url,
586-
'alertmanager_sd_url': alertmanager_sd_url,
587-
'haproxy_sd_url': haproxy_sd_url,
588-
'ceph_exporter_sd_url': ceph_exporter_sd_url,
589-
'nvmeof_sd_url': nvmeof_sd_url,
602+
'service_discovery_cfg': self.get_service_discovery_cfg(security_enabled, mgmt_gw_enabled),
590603
'external_prometheus_targets': targets,
591-
'cluster_fsid': FSID,
592-
'nfs_sd_url': nfs_sd_url,
593-
'smb_sd_url': smb_sd_url,
604+
'cluster_fsid': self.mgr._cluster_fsid,
594605
'clusters_credentials': cluster_credentials,
595606
'federate_path': federate_path
596607
}
@@ -600,69 +611,41 @@ def generate_config(
600611
assert daemon_spec.host is not None
601612
ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or ''
602613
if ip_to_bind_to:
603-
daemon_spec.port_ips = {str(port): ip_to_bind_to}
614+
daemon_spec.port_ips = {str(self.mgr.service_discovery_port): ip_to_bind_to}
604615

605-
web_context = {
606-
'enable_mtls': mgmt_gw_enabled,
607-
'enable_basic_auth': not oauth2_enabled,
608-
'prometheus_web_user': prometheus_user,
609-
'prometheus_web_password': password_hash(prometheus_password),
616+
files = {
617+
'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context)
618+
}
619+
r: Dict[str, Any] = {
620+
'files': files,
621+
'retention_time': retention_time,
622+
'retention_size': retention_size,
623+
'ip_to_bind_to': ip_to_bind_to,
624+
'use_url_prefix': mgmt_gw_enabled
610625
}
611-
612626
if security_enabled:
613627
# Following key/cert are needed for:
614628
# 1- run the prometheus server (web.yml config)
615629
# 2- use mTLS to scrape node-exporter (prometheus acts as client)
616630
# 3- use mTLS to send alerts to alertmanager (prometheus acts as client)
617-
cert, key = self.get_prometheus_certificates(daemon_spec)
618-
r: Dict[str, Any] = {
619-
'files': {
620-
'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context),
621-
'root_cert.pem': self.mgr.cert_mgr.get_root_ca(),
622-
'web.yml': self.mgr.template.render('services/prometheus/web.yml.j2', web_context),
623-
'prometheus.crt': cert,
624-
'prometheus.key': key,
625-
},
626-
'retention_time': retention_time,
627-
'retention_size': retention_size,
628-
'ip_to_bind_to': ip_to_bind_to,
629-
'web_config': '/etc/prometheus/web.yml',
630-
'use_url_prefix': mgmt_gw_enabled
631-
}
632-
r['files'].update(cluster_credentials_files['files'])
633-
else:
634-
r = {
635-
'files': {
636-
'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context)
637-
},
638-
'retention_time': retention_time,
639-
'retention_size': retention_size,
640-
'ip_to_bind_to': ip_to_bind_to,
641-
'use_url_prefix': mgmt_gw_enabled
631+
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
632+
web_context = {
633+
'enable_mtls': mgmt_gw_enabled,
634+
'enable_basic_auth': not oauth2_enabled,
635+
'prometheus_web_user': prometheus_user,
636+
'prometheus_web_password': password_hash(prometheus_password),
642637
}
638+
cert, key = self.get_prometheus_certificates(daemon_spec)
639+
files.update({
640+
'root_cert.pem': self.mgr.cert_mgr.get_root_ca(),
641+
'web.yml': self.mgr.template.render('services/prometheus/web.yml.j2', web_context),
642+
'prometheus.crt': cert,
643+
'prometheus.key': key,
644+
**cluster_credentials_files['files']
645+
})
646+
r.update({'web_config': '/etc/prometheus/web.yml'})
643647

644-
# include alerts, if present in the container
645-
if os.path.exists(self.mgr.prometheus_alerts_path):
646-
with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
647-
alerts = f.read()
648-
r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts
649-
650-
# Include custom alerts if present in key value store. This enables the
651-
# users to add custom alerts. Write the file in any case, so that if the
652-
# content of the key value store changed, that file is overwritten
653-
# (emptied in case they value has been removed from the key value
654-
# store). This prevents the necessity to adapt `cephadm` binary to
655-
# remove the file.
656-
#
657-
# Don't use the template engine for it as
658-
#
659-
# 1. the alerts are always static and
660-
# 2. they are a template themselves for the Go template engine, which
661-
# use curly braces and escaping that is cumbersome and unnecessary
662-
# for the user.
663-
#
664-
r['files']['/etc/prometheus/alerting/custom_alerts.yml'] = \
665-
self.mgr.get_store('services/prometheus/alerting/custom_alerts.yml', '')
648+
self.configure_alerts(r)
666649

667650
return r, self.get_dependencies(self.mgr)
668651

src/pybind/mgr/cephadm/services/service_discovery.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def index(self) -> str:
145145
<head><title>Cephadm HTTP Endpoint</title></head>
146146
<body>
147147
<h2>Cephadm Service Discovery Endpoints</h2>
148-
<p><a href='prometheus/sd-config?service=mgr-prometheus'>mgr/Prometheus http sd-config</a></p>
148+
<p><a href='prometheus/sd-config?service=ceph'>mgr/Prometheus http sd-config</a></p>
149149
<p><a href='prometheus/sd-config?service=alertmanager'>Alertmanager http sd-config</a></p>
150150
<p><a href='prometheus/sd-config?service=node-exporter'>Node exporter http sd-config</a></p>
151151
<p><a href='prometheus/sd-config?service=haproxy'>HAProxy http sd-config</a></p>
@@ -161,26 +161,23 @@ def index(self) -> str:
161161
@cherrypy.tools.json_out()
162162
def get_sd_config(self, service: str) -> List[Dict[str, Collection[str]]]:
    """Return <http_sd_config> compatible prometheus config for the specified service.

    Services whose name starts with "container" are handled by
    ``container_sd_config``; all other names are resolved through an
    exact-match table. Unknown names yield an empty list.
    """
    # Container targets are selected by prefix, so check them before the
    # exact-name lookup.
    if service.startswith("container"):
        return self.container_sd_config(service)

    # 'ceph' and 'ingress' are aliases of 'mgr-prometheus' and 'haproxy'.
    handlers = {
        'mgr-prometheus': self.prometheus_sd_config,
        'ceph': self.prometheus_sd_config,
        'alertmanager': self.alertmgr_sd_config,
        'node-exporter': self.node_exporter_sd_config,
        'haproxy': self.haproxy_sd_config,
        'ingress': self.haproxy_sd_config,
        'ceph-exporter': self.ceph_exporter_sd_config,
        'nvmeof': self.nvmeof_sd_config,
        'nfs': self.nfs_sd_config,
        'smb': self.smb_sd_config,
    }
    handler = handlers.get(service)
    if handler is None:
        return []
    return handler()
184181

185182
def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]:
186183
"""Return <http_sd_config> compatible prometheus config for prometheus service.

0 commit comments

Comments
 (0)