Skip to content

Commit 798b946

Browse files
committed
cephadm: Add pre_remove and ensure deployment values are reset and API settings are updated when removing Prometheus or Alertmanager daemons
This fixes an issue where the dashboard API settings are not updated properly when the active Prometheus or Alertmanager daemon is removed. If the active daemon is removed, the settings are reconfigured to point to a remaining daemon or reset if no daemons are available. This avoids dashboard errors like "404 Not Found" caused by stale API host settings. Signed-off-by: Kushal Deb <[email protected]>
1 parent bceddc4 commit 798b946

File tree

1 file changed

+74
-0
lines changed

1 file changed

+74
-0
lines changed

src/pybind/mgr/cephadm/services/monitoring.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import socket
55
from typing import List, Any, Tuple, Dict, Optional, cast, TYPE_CHECKING
66
import ipaddress
7+
import time
8+
import requests
79

810
from mgr_module import HandleCommandResult
911

@@ -442,6 +444,36 @@ def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
442444
service_url
443445
)
444446

447+
def pre_remove(self, daemon: DaemonDescription) -> None:
    """
    Called before an Alertmanager daemon is removed.

    Compares the dashboard's configured Alertmanager API host with the
    URL built for the daemon being removed. When they match, the
    dashboard is either reconfigured by calling ``config_dashboard`` with
    the remaining Alertmanager daemons or, if none remain, its
    Alertmanager API host and SSL-verify settings are reset.

    Any exception raised along the way is logged and not propagated.

    :param daemon: description of the daemon about to be removed; nothing
        is done when its hostname is unknown.
    """
    if daemon.hostname is None:
        return
    try:
        current_api_host = self.mgr.check_mon_command(
            {"prefix": "dashboard get-alertmanager-api-host"}
        ).stdout.strip()
        daemon_addr = daemon.ip if daemon.ip else self.mgr.get_fqdn(daemon.hostname)
        daemon_port = daemon.ports[0] if daemon.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='http', host=daemon_addr, port=daemon_port)

        if current_api_host != service_url:
            # The dashboard points somewhere else; leave its settings alone.
            logger.info(f"Alertmanager {daemon.name()} removed; no changes to dashboard API settings")
            return

        # This is the active daemon, update or reset the settings.
        remaining_daemons = [
            d for d in self.mgr.cache.get_daemons_by_service(self.TYPE)
            if d.name() != daemon.name()
        ]
        if remaining_daemons:
            self.config_dashboard(remaining_daemons)
            logger.info("Updated dashboard API settings to point to a remaining Alertmanager daemon")
            return

        self.mgr.check_mon_command({"prefix": "dashboard reset-alertmanager-api-host"})
        self.mgr.check_mon_command({"prefix": "dashboard reset-alertmanager-api-ssl-verify"})
        logger.info("Reset dashboard API settings as no Alertmanager daemons are remaining")
    except Exception as e:
        # Best effort: a failure here is logged (with traceback) instead of
        # being raised to the caller.
        logger.exception("Error in Alertmanager pre_remove: %s", e)
476+
445477
def ok_to_stop(self,
446478
daemon_ids: List[str],
447479
force: bool = False,
@@ -716,6 +748,48 @@ def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
716748
service_url
717749
)
718750

751+
def pre_remove(self, daemon: DaemonDescription) -> None:
    """
    Called before a Prometheus daemon is removed.

    Compares the dashboard's configured Prometheus API host with the URL
    built for the daemon being removed. When they match, the dashboard is
    either reconfigured by calling ``config_dashboard`` with the remaining
    Prometheus daemons or, if none remain, its Prometheus API host and
    SSL-verify settings are reset.

    After a reconfiguration the newly configured API host is probed at
    ``/api/v1/rules`` up to ``MAX_RETRIES`` times, ``RETRY_INTERVAL``
    seconds apart, until it answers with HTTP 200.

    Any exception raised along the way is logged and not propagated.

    :param daemon: description of the daemon about to be removed; nothing
        is done when its hostname is unknown.
    """
    MAX_RETRIES = 5
    RETRY_INTERVAL = 5  # seconds slept between two probes
    if daemon.hostname is None:
        return
    try:
        current_api_host = self.mgr.check_mon_command(
            {"prefix": "dashboard get-prometheus-api-host"}
        ).stdout.strip()
        daemon_addr = daemon.ip if daemon.ip else self.mgr.get_fqdn(daemon.hostname)
        daemon_port = daemon.ports[0] if daemon.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme="http", host=daemon_addr, port=daemon_port)

        if current_api_host != service_url:
            # The dashboard points somewhere else; leave its settings alone.
            logger.info("Prometheus daemon removed; no changes to dashboard API settings")
            return

        remaining_daemons = [
            d for d in self.mgr.cache.get_daemons_by_service(self.TYPE)
            if d.name() != daemon.name()
        ]
        if not remaining_daemons:
            self.mgr.check_mon_command({"prefix": "dashboard reset-prometheus-api-host"})
            self.mgr.check_mon_command({"prefix": "dashboard reset-prometheus-api-ssl-verify"})
            logger.info("Reset Prometheus API settings as no daemons are remaining")
            return

        self.config_dashboard(remaining_daemons)
        logger.info("Updated Dashboard Settings to point to remaining Prometheus daemons")

        # Probe the host the dashboard now uses. Probing ``service_url``
        # would only test the daemon that is about to be removed.
        new_api_host = self.mgr.check_mon_command(
            {"prefix": "dashboard get-prometheus-api-host"}
        ).stdout.strip()
        if not new_api_host or new_api_host == service_url:
            logger.warning(
                "Dashboard Prometheus API host was not moved away from %s; "
                "skipping readiness check", service_url
            )
            return

        # NOTE(review): for an https host with a certificate that does not
        # verify, the probe fails and only ends in the warning below -
        # confirm whether ssl-verify should be honoured here.
        probe_url = f"{new_api_host.rstrip('/')}/api/v1/rules"
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = requests.get(probe_url, timeout=5)
                if response.status_code == 200:
                    logger.info(f"Prometheus daemon is ready at {new_api_host}.")
                    break
                reason = f"HTTP {response.status_code}"
            except requests.RequestException as e:
                reason = str(e)
            logger.info(f"Retry {attempt}: Waiting for Prometheus daemon at {new_api_host}: {reason}")
            # No point in sleeping once the retry budget is exhausted.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_INTERVAL)
        else:
            # for/else: reached only when no probe hit the ``break`` above.
            logger.warning("Prometheus daemon did not become ready after retries.")
    except Exception as e:
        # Best effort: a failure here is logged (with traceback) instead of
        # being raised to the caller.
        logger.exception("Error in Prometheus pre_remove: %s", e)
792+
719793
def ok_to_stop(self,
720794
daemon_ids: List[str],
721795
force: bool = False,

0 commit comments

Comments
 (0)