Skip to content

Commit 0392505

Browse files
committed
mgr/cephadm: adding HA support for mgmt-gateway and oauth2-proxy
Adding HA support for mgmt-gateway and oauth2-proxy. In addition, logic is added to prevent unnecessary daemon restarts during mgr failover. Previously, without the management gateway, some daemons, such as Prometheus, had an explicit dependency on the manager because we needed to point to the active manager. With the management gateway, this explicit dependency is no longer necessary, as it automatically handles routing requests to the active manager. Signed-off-by: Redouane Kachach <[email protected]>
1 parent d0db937 commit 0392505

File tree

9 files changed

+155
-84
lines changed

9 files changed

+155
-84
lines changed

src/pybind/mgr/cephadm/module.py

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -819,30 +819,33 @@ def _get_security_config(self) -> Tuple[bool, bool, bool]:
819819
security_enabled = self.secure_monitoring_stack or mgmt_gw_enabled
820820
return security_enabled, mgmt_gw_enabled, oauth2_proxy_enabled
821821

822-
def get_mgmt_gw_internal_endpoint(self) -> Optional[str]:
822+
def _get_mgmt_gw_endpoint(self, is_internal: bool) -> Optional[str]:
823823
mgmt_gw_daemons = self.cache.get_daemons_by_service('mgmt-gateway')
824824
if not mgmt_gw_daemons:
825825
return None
826826

827827
dd = mgmt_gw_daemons[0]
828828
assert dd.hostname is not None
829-
mgmt_gw_addr = self.get_fqdn(dd.hostname)
830-
mgmt_gw_internal_endpoint = build_url(scheme='https', host=mgmt_gw_addr, port=MgmtGatewayService.INTERNAL_SERVICE_PORT)
831-
return f'{mgmt_gw_internal_endpoint}/internal'
829+
mgmt_gw_spec = cast(MgmtGatewaySpec, self.spec_store['mgmt-gateway'].spec)
830+
mgmt_gw_addr = mgmt_gw_spec.virtual_ip if mgmt_gw_spec.virtual_ip is not None else self.get_fqdn(dd.hostname)
832831

833-
def get_mgmt_gw_external_endpoint(self) -> Optional[str]:
834-
mgmt_gw_daemons = self.cache.get_daemons_by_service('mgmt-gateway')
835-
if not mgmt_gw_daemons:
836-
return None
832+
if is_internal:
833+
mgmt_gw_port: Optional[int] = MgmtGatewayService.INTERNAL_SERVICE_PORT
834+
protocol = 'https'
835+
endpoint_suffix = '/internal'
836+
else:
837+
mgmt_gw_port = dd.ports[0] if dd.ports else None
838+
protocol = 'http' if mgmt_gw_spec.disable_https else 'https'
839+
endpoint_suffix = ''
837840

838-
dd = mgmt_gw_daemons[0]
839-
assert dd.hostname is not None
840-
mgmt_gw_port = dd.ports[0] if dd.ports else None
841-
mgmt_gw_addr = self.get_fqdn(dd.hostname)
842-
mgmt_gw_spec = cast(MgmtGatewaySpec, self.spec_store['mgmt-gateway'].spec)
843-
protocol = 'http' if mgmt_gw_spec.disable_https else 'https'
844-
mgmt_gw_external_endpoint = build_url(scheme=protocol, host=mgmt_gw_addr, port=mgmt_gw_port)
845-
return mgmt_gw_external_endpoint
841+
mgmt_gw_endpoint = build_url(scheme=protocol, host=mgmt_gw_addr, port=mgmt_gw_port)
842+
return f'{mgmt_gw_endpoint}{endpoint_suffix}'
843+
844+
def get_mgmt_gw_internal_endpoint(self) -> Optional[str]:
845+
return self._get_mgmt_gw_endpoint(is_internal=True)
846+
847+
def get_mgmt_gw_external_endpoint(self) -> Optional[str]:
848+
return self._get_mgmt_gw_endpoint(is_internal=False)
846849

847850
def _get_cephadm_binary_path(self) -> str:
848851
import hashlib
@@ -3001,8 +3004,16 @@ def get_daemon_names(daemons: List[str]) -> List[str]:
30013004
daemon_names.append(dd.name())
30023005
return daemon_names
30033006

3004-
alertmanager_user, alertmanager_password = self._get_alertmanager_credentials()
3005-
prometheus_user, prometheus_password = self._get_prometheus_credentials()
3007+
prom_cred_hash = None
3008+
alertmgr_cred_hash = None
3009+
security_enabled, mgmt_gw_enabled, _ = self._get_security_config()
3010+
if security_enabled:
3011+
alertmanager_user, alertmanager_password = self._get_alertmanager_credentials()
3012+
prometheus_user, prometheus_password = self._get_prometheus_credentials()
3013+
if prometheus_user and prometheus_password:
3014+
prom_cred_hash = f'{utils.md5_hash(prometheus_user + prometheus_password)}'
3015+
if alertmanager_user and alertmanager_password:
3016+
alertmgr_cred_hash = f'{utils.md5_hash(alertmanager_user + alertmanager_password)}'
30063017

30073018
deps = []
30083019
if daemon_type == 'haproxy':
@@ -3049,9 +3060,10 @@ def get_daemon_names(daemons: List[str]) -> List[str]:
30493060
else:
30503061
deps = [self.get_mgr_ip()]
30513062
elif daemon_type == 'prometheus':
3052-
# for prometheus we add the active mgr as an explicit dependency,
3053-
# this way we force a redeploy after a mgr failover
3054-
deps.append(self.get_active_mgr().name())
3063+
if not mgmt_gw_enabled:
3064+
# for prometheus we add the active mgr as an explicit dependency,
3065+
# this way we force a redeploy after a mgr failover
3066+
deps.append(self.get_active_mgr().name())
30553067
deps.append(str(self.get_module_option_ex('prometheus', 'server_port', 9283)))
30563068
deps.append(str(self.service_discovery_port))
30573069
# prometheus yaml configuration file (generated by prometheus.yml.j2) contains
@@ -3068,22 +3080,20 @@ def get_daemon_names(daemons: List[str]) -> List[str]:
30683080
deps += [d.name() for d in self.cache.get_daemons_by_service('ceph-exporter')]
30693081
deps += [d.name() for d in self.cache.get_daemons_by_service('mgmt-gateway')]
30703082
deps += [d.name() for d in self.cache.get_daemons_by_service('oauth2-proxy')]
3071-
security_enabled, _, _ = self._get_security_config()
3072-
if security_enabled:
3073-
if prometheus_user and prometheus_password:
3074-
deps.append(f'{hash(prometheus_user + prometheus_password)}')
3075-
if alertmanager_user and alertmanager_password:
3076-
deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
3083+
if prom_cred_hash is not None:
3084+
deps.append(prom_cred_hash)
3085+
if alertmgr_cred_hash is not None:
3086+
deps.append(alertmgr_cred_hash)
30773087
elif daemon_type == 'grafana':
30783088
deps += get_daemon_names(['prometheus', 'loki', 'mgmt-gateway', 'oauth2-proxy'])
3079-
security_enabled, _, _ = self._get_security_config()
3080-
if security_enabled and prometheus_user and prometheus_password:
3081-
deps.append(f'{hash(prometheus_user + prometheus_password)}')
3089+
if prom_cred_hash is not None:
3090+
deps.append(prom_cred_hash)
30823091
elif daemon_type == 'alertmanager':
3083-
deps += get_daemon_names(['mgr', 'alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy'])
3084-
security_enabled, _, _ = self._get_security_config()
3085-
if security_enabled and alertmanager_user and alertmanager_password:
3086-
deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
3092+
deps += get_daemon_names(['alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy'])
3093+
if not mgmt_gw_enabled:
3094+
deps += get_daemon_names(['mgr'])
3095+
if alertmgr_cred_hash is not None:
3096+
deps.append(alertmgr_cred_hash)
30873097
elif daemon_type == 'promtail':
30883098
deps += get_daemon_names(['loki'])
30893099
elif daemon_type in ['ceph-exporter', 'node-exporter']:
@@ -3097,7 +3107,7 @@ def get_daemon_names(daemons: List[str]) -> List[str]:
30973107
elif daemon_type == 'mgmt-gateway':
30983108
# url_prefix for monitoring daemons depends on the presence of mgmt-gateway
30993109
# while dashboard urls depend on the mgr daemons
3100-
deps += get_daemon_names(['mgr', 'grafana', 'prometheus', 'alertmanager', 'oauth2-proxy'])
3110+
deps += get_daemon_names(['grafana', 'prometheus', 'alertmanager', 'oauth2-proxy'])
31013111
else:
31023112
# this daemon type doesn't need deps mgmt
31033113
pass

src/pybind/mgr/cephadm/services/ingress.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,12 @@ def keepalived_generate_config(
241241
if spec.keepalived_password:
242242
password = spec.keepalived_password
243243

244-
daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
244+
if spec.keepalive_only:
245+
# when keepalive_only instead of haproxy, we have to monitor the backend service daemons
246+
if spec.backend_service is not None:
247+
daemons = self.mgr.cache.get_daemons_by_service(spec.backend_service)
248+
else:
249+
daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
245250

246251
if not daemons and not spec.keepalive_only:
247252
raise OrchestratorError(
@@ -297,6 +302,10 @@ def _get_valid_interface_and_ip(vip: str, host: str) -> Tuple[str, str]:
297302
port = d.ports[1] # monitoring port
298303
host_ip = d.ip or self.mgr.inventory.get_addr(d.hostname)
299304
script = f'/usr/bin/curl {build_url(scheme="http", host=host_ip, port=port)}/health'
305+
elif d.daemon_type == 'mgmt-gateway':
306+
mgmt_gw_port = d.ports[0] if d.ports else None
307+
host_ip = d.ip or self.mgr.inventory.get_addr(d.hostname)
308+
script = f'/usr/bin/curl -k {build_url(scheme="https", host=host_ip, port=mgmt_gw_port)}/health'
300309
assert script
301310

302311
states = []

src/pybind/mgr/cephadm/services/mgmt_gateway.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import logging
2-
from typing import List, Any, Tuple, Dict, cast, Optional
2+
from typing import List, Any, Tuple, Dict, cast, TYPE_CHECKING
33

44
from orchestrator import DaemonDescription
55
from ceph.deployment.service_spec import MgmtGatewaySpec, GrafanaSpec
@@ -36,10 +36,11 @@ def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDes
3636
# if empty list provided, return empty Daemon Desc
3737
return DaemonDescription()
3838

39-
def get_oauth2_service_url(self) -> Optional[str]:
40-
# TODO(redo): check how can we create several servers for HA
41-
oauth2_servers = self.get_service_endpoints('oauth2-proxy')
42-
return f'https://{oauth2_servers[0]}' if oauth2_servers else None
39+
def get_mgmt_gw_ips(self, svc_spec: MgmtGatewaySpec, daemon_spec: CephadmDaemonDeploySpec) -> List[str]:
40+
mgmt_gw_ips = [self.mgr.inventory.get_addr(daemon_spec.host)]
41+
if svc_spec.virtual_ip is not None:
42+
mgmt_gw_ips.append(svc_spec.virtual_ip)
43+
return mgmt_gw_ips
4344

4445
def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
4546
# we adjust the standby behaviour so rev-proxy can pick correctly the active instance
@@ -56,9 +57,9 @@ def get_external_certificates(self, svc_spec: MgmtGatewaySpec, daemon_spec: Ceph
5657
key = svc_spec.ssl_certificate_key
5758
else:
5859
# not provided on the spec, let's generate self-sigend certificates
59-
addr = self.mgr.inventory.get_addr(daemon_spec.host)
60+
ips = self.get_mgmt_gw_ips(svc_spec, daemon_spec)
6061
host_fqdn = self.mgr.get_fqdn(daemon_spec.host)
61-
cert, key = self.mgr.cert_mgr.generate_cert(host_fqdn, addr)
62+
cert, key = self.mgr.cert_mgr.generate_cert(host_fqdn, ips)
6263
# save certificates
6364
if cert and key:
6465
self.mgr.cert_key_store.save_cert('mgmt_gw_cert', cert)
@@ -67,10 +68,18 @@ def get_external_certificates(self, svc_spec: MgmtGatewaySpec, daemon_spec: Ceph
6768
logger.error("Failed to obtain certificate and key from mgmt-gateway.")
6869
return cert, key
6970

70-
def get_internal_certificates(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]:
71-
node_ip = self.mgr.inventory.get_addr(daemon_spec.host)
71+
def get_internal_certificates(self, svc_spec: MgmtGatewaySpec, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]:
72+
ips = self.get_mgmt_gw_ips(svc_spec, daemon_spec)
7273
host_fqdn = self.mgr.get_fqdn(daemon_spec.host)
73-
return self.mgr.cert_mgr.generate_cert(host_fqdn, node_ip)
74+
return self.mgr.cert_mgr.generate_cert(host_fqdn, ips)
75+
76+
def get_service_discovery_endpoints(self) -> List[str]:
77+
sd_endpoints = []
78+
for dd in self.mgr.cache.get_daemons_by_service('mgr'):
79+
assert dd.hostname is not None
80+
addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
81+
sd_endpoints.append(f"{addr}:{self.mgr.service_discovery_port}")
82+
return sd_endpoints
7483

7584
def get_mgmt_gateway_deps(self) -> List[str]:
7685
# url_prefix for the following services depends on the presence of mgmt-gateway
@@ -79,10 +88,6 @@ def get_mgmt_gateway_deps(self) -> List[str]:
7988
deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('alertmanager')]
8089
deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('grafana')]
8190
deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')]
82-
for dd in self.mgr.cache.get_daemons_by_service('mgr'):
83-
# we consider mgr a dep even if the dashboard is disabled
84-
# in order to be consistent with _calc_daemon_deps().
85-
deps.append(dd.name())
8691

8792
return deps
8893

@@ -94,6 +99,8 @@ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[st
9499
prometheus_endpoints = self.get_service_endpoints('prometheus')
95100
alertmanager_endpoints = self.get_service_endpoints('alertmanager')
96101
grafana_endpoints = self.get_service_endpoints('grafana')
102+
oauth2_proxy_endpoints = self.get_service_endpoints('oauth2-proxy')
103+
service_discovery_endpoints = self.get_service_discovery_endpoints()
97104
try:
98105
grafana_spec = cast(GrafanaSpec, self.mgr.spec_store['grafana'].spec)
99106
grafana_protocol = grafana_spec.protocol
@@ -104,7 +111,9 @@ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[st
104111
'dashboard_endpoints': dashboard_endpoints,
105112
'prometheus_endpoints': prometheus_endpoints,
106113
'alertmanager_endpoints': alertmanager_endpoints,
107-
'grafana_endpoints': grafana_endpoints
114+
'grafana_endpoints': grafana_endpoints,
115+
'oauth2_proxy_endpoints': oauth2_proxy_endpoints,
116+
'service_discovery_endpoints': service_discovery_endpoints
108117
}
109118
server_context = {
110119
'spec': svc_spec,
@@ -117,11 +126,12 @@ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[st
117126
'prometheus_endpoints': prometheus_endpoints,
118127
'alertmanager_endpoints': alertmanager_endpoints,
119128
'grafana_endpoints': grafana_endpoints,
120-
'oauth2_proxy_url': self.get_oauth2_service_url(),
129+
'service_discovery_endpoints': service_discovery_endpoints,
130+
'enable_oauth2_proxy': bool(oauth2_proxy_endpoints),
121131
}
122132

123133
cert, key = self.get_external_certificates(svc_spec, daemon_spec)
124-
internal_cert, internal_pkey = self.get_internal_certificates(daemon_spec)
134+
internal_cert, internal_pkey = self.get_internal_certificates(svc_spec, daemon_spec)
125135
daemon_config = {
126136
"files": {
127137
"nginx.conf": self.mgr.template.render(self.SVC_TEMPLATE_PATH, main_context),

0 commit comments

Comments
 (0)