Skip to content

Commit 82b50b4

Browse files
author
Aashish Sharma
committed
mgr/dashboard: add prometheus federation config for multi-cluster
monitoring

Signed-off-by: Aashish Sharma <[email protected]>
1 parent 733ea71 commit 82b50b4

File tree

7 files changed

+140
-3
lines changed

7 files changed

+140
-3
lines changed

src/pybind/mgr/cephadm/module.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
from cephadm.service_discovery import ServiceDiscovery
1818

19+
from ceph.deployment.service_spec import PrometheusSpec
20+
1921
import string
2022
from typing import List, Dict, Optional, Callable, Tuple, TypeVar, \
2123
Any, Set, TYPE_CHECKING, cast, NamedTuple, Sequence, Type, \
@@ -2980,6 +2982,38 @@ def set_prometheus_access_info(self, user: str, password: str) -> str:
29802982
self.set_store(PrometheusService.PASS_CFG_KEY, password)
29812983
return 'prometheus credentials updated correctly'
29822984

2985+
@handle_orch_error
def set_prometheus_target(self, url: str) -> str:
    """Add ``url`` to the prometheus spec's target list and redeploy prometheus.

    The spec stored under ``'prometheus'`` is looked up, ``url`` is appended
    to its ``targets``, the spec is re-applied through ``self.apply`` and a
    ``redeploy`` action is issued for every prometheus daemon reported by
    the cache.

    :param url: target to add to ``PrometheusSpec.targets``.
    :return: a human-readable status message; a distinct message is returned
        when the spec is missing/empty or when ``url`` is already listed.
    """
    prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
    # ``cast`` performs no runtime check, so validate the spec *before*
    # touching ``.targets``; otherwise a missing spec fails with an
    # AttributeError instead of producing the message below.
    if not prometheus_spec:
        return "Service prometheus not found\n"
    if url in prometheus_spec.targets:
        return f"Target '{url}' already exists.\n"
    prometheus_spec.targets.append(url)

    daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('prometheus')
    # Round-trip through JSON so ``apply`` receives a fresh spec object
    # rather than the instance held by the spec store.
    spec = ServiceSpec.from_json(prometheus_spec.to_json())
    self.apply([spec], no_overwrite=False)
    for daemon in daemons:
        self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name)
    return 'prometheus multi-cluster targets updated'
3000+
3001+
@handle_orch_error
def remove_prometheus_target(self, url: str) -> str:
    """Remove ``url`` from the prometheus spec's target list and redeploy prometheus.

    The spec stored under ``'prometheus'`` is looked up, ``url`` is removed
    from its ``targets``, the spec is re-applied through ``self.apply`` and
    a ``redeploy`` action is issued for every prometheus daemon reported by
    the cache.

    :param url: target to remove from ``PrometheusSpec.targets``.
    :return: a human-readable status message; a distinct message is returned
        when the spec is missing/empty or when ``url`` is not listed.
    """
    prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
    # ``cast`` performs no runtime check, so validate the spec *before*
    # touching ``.targets``; otherwise a missing spec fails with an
    # AttributeError instead of producing the message below.
    if not prometheus_spec:
        return "Service prometheus not found\n"
    if url not in prometheus_spec.targets:
        return f"Target '{url}' does not exist.\n"
    prometheus_spec.targets.remove(url)

    daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('prometheus')
    # Round-trip through JSON so ``apply`` receives a fresh spec object
    # rather than the instance held by the spec store.
    spec = ServiceSpec.from_json(prometheus_spec.to_json())
    self.apply([spec], no_overwrite=False)
    for daemon in daemons:
        self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name)
    return 'prometheus multi-cluster targets updated'
3016+
29833017
@handle_orch_error
29843018
def set_alertmanager_access_info(self, user: str, password: str) -> str:
29853019
self.set_store(AlertmanagerService.USER_CFG_KEY, user)

src/pybind/mgr/cephadm/services/monitoring.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,12 +376,17 @@ def generate_config(
376376

377377
assert self.TYPE == daemon_spec.daemon_type
378378
spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
379-
380379
try:
381380
retention_time = spec.retention_time if spec.retention_time else '15d'
382381
except AttributeError:
383382
retention_time = '15d'
384383

384+
try:
385+
targets = spec.targets
386+
except AttributeError:
387+
logger.warning('Prometheus targets not found in the spec. Using empty list.')
388+
targets = []
389+
385390
try:
386391
retention_size = spec.retention_size if spec.retention_size else '0'
387392
except AttributeError:
@@ -406,6 +411,7 @@ def generate_config(
406411

407412
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
408413
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
414+
FSID = self.mgr._cluster_fsid
409415

410416
# generate the prometheus configuration
411417
context = {
@@ -420,6 +426,8 @@ def generate_config(
420426
'haproxy_sd_url': haproxy_sd_url,
421427
'ceph_exporter_sd_url': ceph_exporter_sd_url,
422428
'nvmeof_sd_url': nvmeof_sd_url,
429+
'external_prometheus_targets': targets,
430+
'cluster_fsid': FSID
423431
}
424432

425433
web_context = {

src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
global:
33
scrape_interval: 10s
44
evaluation_interval: 10s
5+
{% if not secure_monitoring_stack %}
6+
external_labels:
7+
cluster: {{ cluster_fsid }}
8+
{% endif %}
9+
510
rule_files:
611
- /etc/prometheus/alerting/*
712

@@ -45,6 +50,10 @@ scrape_configs:
4550
ca_file: root_cert.pem
4651
{% else %}
4752
honor_labels: true
53+
relabel_configs:
54+
- source_labels: [__address__]
55+
target_label: cluster
56+
replacement: {{ cluster_fsid }}
4857
http_sd_configs:
4958
- url: {{ mgr_prometheus_sd_url }}
5059
{% endif %}
@@ -65,6 +74,10 @@ scrape_configs:
6574
{% else %}
6675
http_sd_configs:
6776
- url: {{ node_exporter_sd_url }}
77+
relabel_configs:
78+
- source_labels: [__address__]
79+
target_label: cluster
80+
replacement: {{ cluster_fsid }}
6881
{% endif %}
6982
{% endif %}
7083

@@ -84,6 +97,10 @@ scrape_configs:
8497
{% else %}
8598
http_sd_configs:
8699
- url: {{ haproxy_sd_url }}
100+
relabel_configs:
101+
- source_labels: [__address__]
102+
target_label: cluster
103+
replacement: {{ cluster_fsid }}
87104
{% endif %}
88105
{% endif %}
89106

@@ -103,6 +120,10 @@ scrape_configs:
103120
ca_file: root_cert.pem
104121
{% else %}
105122
honor_labels: true
123+
relabel_configs:
124+
- source_labels: [__address__]
125+
target_label: cluster
126+
replacement: {{ cluster_fsid }}
106127
http_sd_configs:
107128
- url: {{ ceph_exporter_sd_url }}
108129
{% endif %}
@@ -127,3 +148,19 @@ scrape_configs:
127148
- url: {{ nvmeof_sd_url }}
128149
{% endif %}
129150
{% endif %}
151+
152+
{% if not secure_monitoring_stack %}
153+
- job_name: 'federate'
154+
scrape_interval: 15s
155+
honor_labels: true
156+
metrics_path: '/federate'
157+
params:
158+
'match[]':
159+
- '{job="ceph"}'
160+
- '{job="node"}'
161+
- '{job="haproxy"}'
162+
- '{job="ceph-exporter"}'
163+
static_configs:
164+
- targets: {{ external_prometheus_targets }}
165+
{% endif %}
166+

src/pybind/mgr/cephadm/tests/test_services.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,9 @@ def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module:
692692
global:
693693
scrape_interval: 10s
694694
evaluation_interval: 10s
695+
external_labels:
696+
cluster: fsid
697+
695698
rule_files:
696699
- /etc/prometheus/alerting/*
697700
@@ -704,25 +707,54 @@ def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module:
704707
scrape_configs:
705708
- job_name: 'ceph'
706709
honor_labels: true
710+
relabel_configs:
711+
- source_labels: [__address__]
712+
target_label: cluster
713+
replacement: fsid
707714
http_sd_configs:
708715
- url: http://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus
709716
710717
- job_name: 'node'
711718
http_sd_configs:
712719
- url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
720+
relabel_configs:
721+
- source_labels: [__address__]
722+
target_label: cluster
723+
replacement: fsid
713724
714725
- job_name: 'haproxy'
715726
http_sd_configs:
716727
- url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy
728+
relabel_configs:
729+
- source_labels: [__address__]
730+
target_label: cluster
731+
replacement: fsid
717732
718733
- job_name: 'ceph-exporter'
719734
honor_labels: true
735+
relabel_configs:
736+
- source_labels: [__address__]
737+
target_label: cluster
738+
replacement: fsid
720739
http_sd_configs:
721740
- url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter
722741
723742
- job_name: 'nvmeof'
724743
http_sd_configs:
725744
- url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
745+
746+
- job_name: 'federate'
747+
scrape_interval: 15s
748+
honor_labels: true
749+
metrics_path: '/federate'
750+
params:
751+
'match[]':
752+
- '{job="ceph"}'
753+
- '{job="node"}'
754+
- '{job="haproxy"}'
755+
- '{job="ceph-exporter"}'
756+
static_configs:
757+
- targets: []
726758
""").lstrip()
727759

728760
_run_cephadm.assert_called_with(
@@ -810,6 +842,7 @@ def gen_cert(host, addr):
810842
global:
811843
scrape_interval: 10s
812844
evaluation_interval: 10s
845+
813846
rule_files:
814847
- /etc/prometheus/alerting/*
815848
@@ -892,6 +925,7 @@ def gen_cert(host, addr):
892925
password: sd_password
893926
tls_config:
894927
ca_file: root_cert.pem
928+
895929
""").lstrip()
896930

897931
_run_cephadm.assert_called_with(

src/pybind/mgr/orchestrator/_interface.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,14 @@ def set_prometheus_access_info(self, user: str, password: str) -> OrchResult[str
775775
"""set prometheus access information"""
776776
raise NotImplementedError()
777777

778+
def set_prometheus_target(self, url: str) -> OrchResult[str]:
779+
"""Set a prometheus target for multi-cluster monitoring.

:param url: the target to add.

This definition is a stub: it unconditionally raises
``NotImplementedError``.
"""
780+
raise NotImplementedError()
781+
782+
def remove_prometheus_target(self, url: str) -> OrchResult[str]:
783+
"""Remove a prometheus target for multi-cluster monitoring.

:param url: the target to remove.

This definition is a stub: it unconditionally raises
``NotImplementedError``.
"""
784+
raise NotImplementedError()
785+
778786
def get_alertmanager_access_info(self) -> OrchResult[Dict[str, str]]:
779787
"""get alertmanager access information"""
780788
raise NotImplementedError()

src/pybind/mgr/orchestrator/module.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,18 @@ def _set_prometheus_access_info(self, username: Optional[str] = None, password:
11431143
except ArgumentError as e:
11441144
return HandleCommandResult(-errno.EINVAL, "", (str(e)))
11451145

1146+
@_cli_write_command('orch prometheus set-target')
def _set_prometheus_target(self, url: str) -> HandleCommandResult:
    """CLI handler for ``orch prometheus set-target``.

    Calls ``self.set_prometheus_target(url)``, unwraps the completion with
    ``raise_if_exception`` and returns the result JSON-encoded on stdout.
    """
    outcome = raise_if_exception(self.set_prometheus_target(url))
    payload = json.dumps(outcome)
    return HandleCommandResult(stdout=payload)
1151+
1152+
@_cli_write_command('orch prometheus remove-target')
def _remove_prometheus_target(self, url: str) -> HandleCommandResult:
    """CLI handler for ``orch prometheus remove-target``.

    Calls ``self.remove_prometheus_target(url)``, unwraps the completion with
    ``raise_if_exception`` and returns the result JSON-encoded on stdout.
    """
    outcome = raise_if_exception(self.remove_prometheus_target(url))
    payload = json.dumps(outcome)
    return HandleCommandResult(stdout=payload)
1157+
11461158
@_cli_write_command('orch alertmanager set-credentials')
11471159
def _set_alertmanager_access_info(self, username: Optional[str] = None, password: Optional[str] = None, inbuf: Optional[str] = None) -> HandleCommandResult:
11481160
try:

src/python-common/ceph/deployment/service_spec.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ def __init__(self,
818818
unmanaged: bool = False,
819819
preview_only: bool = False,
820820
networks: Optional[List[str]] = None,
821+
targets: Optional[List[str]] = None,
821822
extra_container_args: Optional[GeneralArgList] = None,
822823
extra_entrypoint_args: Optional[GeneralArgList] = None,
823824
custom_configs: Optional[List[CustomConfig]] = None,
@@ -854,6 +855,7 @@ def __init__(self,
854855
#: :ref:`cephadm-monitoring-networks-ports`,
855856
#: :ref:`cephadm-rgw-networks` and :ref:`cephadm-mgr-networks`.
856857
self.networks: List[str] = networks or []
858+
self.targets: List[str] = targets or []
857859

858860
self.config: Optional[Dict[str, str]] = None
859861
if config:
@@ -1733,6 +1735,7 @@ def __init__(self,
17331735
unmanaged: bool = False,
17341736
preview_only: bool = False,
17351737
port: Optional[int] = None,
1738+
targets: Optional[List[str]] = None,
17361739
extra_container_args: Optional[GeneralArgList] = None,
17371740
extra_entrypoint_args: Optional[GeneralArgList] = None,
17381741
custom_configs: Optional[List[CustomConfig]] = None,
@@ -1746,7 +1749,7 @@ def __init__(self,
17461749
preview_only=preview_only, config=config,
17471750
networks=networks, extra_container_args=extra_container_args,
17481751
extra_entrypoint_args=extra_entrypoint_args,
1749-
custom_configs=custom_configs)
1752+
custom_configs=custom_configs, targets=targets)
17501753

17511754
self.service_type = service_type
17521755
self.port = port
@@ -1881,6 +1884,7 @@ def __init__(self,
18811884
port: Optional[int] = None,
18821885
retention_time: Optional[str] = None,
18831886
retention_size: Optional[str] = None,
1887+
targets: Optional[List[str]] = None,
18841888
extra_container_args: Optional[GeneralArgList] = None,
18851889
extra_entrypoint_args: Optional[GeneralArgList] = None,
18861890
custom_configs: Optional[List[CustomConfig]] = None,
@@ -1889,7 +1893,7 @@ def __init__(self,
18891893
super(PrometheusSpec, self).__init__(
18901894
'prometheus', service_id=service_id,
18911895
placement=placement, unmanaged=unmanaged,
1892-
preview_only=preview_only, config=config, networks=networks, port=port,
1896+
preview_only=preview_only, config=config, networks=networks, port=port, targets=targets,
18931897
extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args,
18941898
custom_configs=custom_configs)
18951899

0 commit comments

Comments
 (0)