Skip to content

Commit fbc054b

Browse files
authored
Merge pull request ceph#65150 from rkachach/fix_issue_nvmeof_prometheus
mgr/cephadm: fixing nvmeof scraping Prometheus config generation Reviewed-by: Kushal Deb <[email protected]>
2 parents a768156 + b184df7 commit fbc054b

File tree

4 files changed

+66
-17
lines changed

4 files changed

+66
-17
lines changed

src/pybind/mgr/cephadm/services/monitoring.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -522,24 +522,30 @@ def get_service_discovery_cfg(self, security_enabled: bool, mgmt_gw_enabled: boo
522522
"""
523523
Retrieves the service discovery URLs for the services that require monitoring
524524
525+
Note: we always add the 'ceph' Prometheus target as it corresponds to the prometheus-mgr module target
526+
525527
Returns:
526528
Dict[str, List[str]]: A dictionary where the keys represent service categories (e.g., "nfs", "node-exporter") and
527529
the values are a list of service-discovery URLs used to get the corresponding service targets.
528530
"""
531+
532+
def sd_urls(svc: str, prefixes: List[str]) -> list[str]:
533+
return [f'{p}/sd/prometheus/sd-config?service={svc}' for p in prefixes]
534+
529535
if mgmt_gw_enabled:
530536
service_discovery_url_prefixes = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}']
531537
else:
532538
port = self.mgr.service_discovery_port
533539
protocol = 'https' if security_enabled else 'http'
534540
service_discovery_url_prefixes = [f'{protocol}://{wrap_ipv6(ip)}:{port}'
535541
for ip in self.mgr._get_mgr_ips()]
536-
return {
537-
service: [f'{prefix}/sd/prometheus/sd-config?service={service}' for prefix in service_discovery_url_prefixes]
538-
for service in service_registry.get_services_requiring_monitoring()
539-
if service == 'ceph'
540-
or bool(self.mgr.cache.get_daemons_by_service(service))
541-
or bool(self.mgr.cache.get_daemons_by_type(service))
542-
}
542+
543+
services_to_monitor = ['ceph', *(
544+
s for s in service_registry.get_services_requiring_monitoring()
545+
if self.mgr.cache.get_daemons_by_service(s) or self.mgr.cache.get_daemons_by_type(s)
546+
)]
547+
548+
return {s: sd_urls(s, service_discovery_url_prefixes) for s in services_to_monitor}
543549

544550
def configure_alerts(self, r: Dict) -> None:
545551
# include alerts, if present in the container
@@ -666,14 +672,12 @@ def get_dependencies(cls, mgr: "CephadmOrchestrator",
666672
deps.append(f'alert-cred:{utils.md5_hash(alertmanager_user + alertmanager_password)}')
667673

668674
# Adding other services as deps (with corresponding justification):
669-
# ceph-exporter: scraping target
670-
# node-exporter: scraping target
671-
# ingress : scraping target
672-
# alert-manager: part of prometheus configuration
673-
# mgmt-gateway : since url_prefix depends on the existence of mgmt-gateway
675+
# mgmt-gateway : url_prefix depends on the existence of mgmt-gateway
674676
# oauth2-proxy : enabling basic-auth (or not) depends on the existence of 'oauth2-proxy'
675-
for svc in ['mgmt-gateway', 'oauth2-proxy', 'alertmanager', 'node-exporter', 'ceph-exporter', 'ingress']:
676-
deps.append(f'{svc}_configured:{bool(mgr.cache.get_daemons_by_service(svc))}')
677+
prometheus_svc_deps = service_registry.get_services_requiring_monitoring() + ['mgmt-gateway', 'oauth2-proxy']
678+
for svc in prometheus_svc_deps:
679+
configured = bool(mgr.cache.get_daemons_by_service(svc)) or bool(mgr.cache.get_daemons_by_type(svc))
680+
deps.append(f'{svc}_configured:{configured}')
677681

678682
if not mgmt_gw_enabled:
679683
# Ceph mgrs are a dependency because, when mgmt-gateway is not enabled, the service discovery depends on the mgr IPs

src/pybind/mgr/cephadm/services/nvmeof.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,14 @@ def daemon_check_post(self, daemon_descrs: List[DaemonDescription]) -> None:
158158

159159
def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
160160
def get_set_cmd_dicts(out: str) -> List[dict]:
161-
gateways = json.loads(out)['gateways']
162-
cmd_dicts = []
163161

162+
try:
163+
gateways = json.loads(out).get('gateways', [])
164+
except json.decoder.JSONDecodeError as e:
165+
logger.error(f'Error while trying to parse gateways JSON: {e}')
166+
return []
167+
168+
cmd_dicts = []
164169
for dd in daemon_descrs:
165170
spec = cast(NvmeofServiceSpec,
166171
self.mgr.spec_store.all_specs.get(dd.service_name(), None))

src/pybind/mgr/cephadm/services/service_registry.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def get_service(self, service_type: str) -> "CephadmService":
6969
def get_services_requiring_monitoring(self) -> List[str]:
7070
"""Return a list with service types that require monitoring."""
7171
services_to_monitor = [svc for svc in self._services if self._services[svc].needs_monitoring]
72-
services_to_monitor.append('ceph') # this is needed for mgr-prometheus targets
7372
return sorted(services_to_monitor)
7473

7574

src/pybind/mgr/cephadm/tests/test_services.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1145,6 +1145,8 @@ def test_node_exporter_config_with_mgmt_gw(
11451145
@patch("cephadm.module.CephadmOrchestrator._get_mgr_ips", lambda _: ['192.168.100.100', '::1'])
11461146
def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
11471147
_run_cephadm.side_effect = async_side_effect(('{}', '', 0))
1148+
pool = 'testpool'
1149+
group = 'mygroup'
11481150
s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast')
11491151
with with_host(cephadm_module, 'test'):
11501152
# host "test" needs to have networks for keepalive to be placed
@@ -1165,6 +1167,9 @@ def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module:
11651167
keepalived_password='12345',
11661168
virtual_ip="1.2.3.4/32",
11671169
backend_service='rgw.foo')) as _, \
1170+
with_service(cephadm_module, NvmeofServiceSpec(service_id=f'{pool}.{group}',
1171+
group=group,
1172+
pool=pool)) as _, \
11681173
with_service(cephadm_module, PrometheusSpec('prometheus',
11691174
networks=['1.2.3.0/24'],
11701175
only_bind_port_on_networks=True)) as _:
@@ -1231,6 +1236,16 @@ def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module:
12311236
- url: http://192.168.100.100:8765/sd/prometheus/sd-config?service=node-exporter
12321237
- url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
12331238
1239+
- job_name: 'nvmeof'
1240+
relabel_configs:
1241+
- source_labels: [__address__]
1242+
target_label: cluster
1243+
replacement: fsid
1244+
honor_labels: true
1245+
http_sd_configs:
1246+
- url: http://192.168.100.100:8765/sd/prometheus/sd-config?service=nvmeof
1247+
- url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
1248+
12341249
12351250
""").lstrip()
12361251

@@ -1282,6 +1297,8 @@ def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module:
12821297
def test_prometheus_config_security_enabled(self, _run_cephadm, _get_uname, cephadm_module: CephadmOrchestrator):
12831298
_run_cephadm.side_effect = async_side_effect(('{}', '', 0))
12841299
_get_uname.return_value = 'test'
1300+
pool = 'testpool'
1301+
group = 'mygroup'
12851302
s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast')
12861303
smb_spec = SMBSpec(cluster_id='foxtrot', config_uri='rados://.smb/foxtrot/config.json',)
12871304

@@ -1312,6 +1329,9 @@ def test_prometheus_config_security_enabled(self, _run_cephadm, _get_uname, ceph
13121329
keepalived_password='12345',
13131330
virtual_ip="1.2.3.4/32",
13141331
backend_service='rgw.foo')) as _, \
1332+
with_service(cephadm_module, NvmeofServiceSpec(service_id=f'{pool}.{group}',
1333+
group=group,
1334+
pool=pool)) as _, \
13151335
with_service(cephadm_module, PrometheusSpec('prometheus')) as _:
13161336

13171337
web_config = dedent("""
@@ -1442,6 +1462,27 @@ def test_prometheus_config_security_enabled(self, _run_cephadm, _get_uname, ceph
14421462
cert_file: prometheus.crt
14431463
key_file: prometheus.key
14441464
1465+
- job_name: 'nvmeof'
1466+
relabel_configs:
1467+
- source_labels: [__address__]
1468+
target_label: cluster
1469+
replacement: fsid
1470+
scheme: https
1471+
tls_config:
1472+
ca_file: root_cert.pem
1473+
cert_file: prometheus.crt
1474+
key_file: prometheus.key
1475+
honor_labels: true
1476+
http_sd_configs:
1477+
- url: https://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
1478+
basic_auth:
1479+
username: sd_user
1480+
password: sd_password
1481+
tls_config:
1482+
ca_file: root_cert.pem
1483+
cert_file: prometheus.crt
1484+
key_file: prometheus.key
1485+
14451486
- job_name: 'smb'
14461487
relabel_configs:
14471488
- source_labels: [__address__]

0 commit comments

Comments
 (0)