Skip to content

Commit 0a8990d

Browse files
authored
Merge pull request ceph#64763 from adk3798/cephadm-limit-list-servers-calls

mgr/cephadm: limit calls to list_servers

Reviewed-by: John Mulligan <[email protected]>
2 parents 9d81862 + 726bb5a commit 0a8990d

File tree

4 files changed: 39 additions (+39) and 27 deletions (-27)

src/pybind/mgr/cephadm/module.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
204204
default=10 * 60,
205205
desc='how frequently to perform a host check',
206206
),
207+
Option(
208+
'stray_daemon_check_interval',
209+
type='secs',
210+
default=30 * 60,
211+
desc='how frequently cephadm should check for the presence of stray daemons',
212+
),
207213
Option(
208214
'mode',
209215
type='str',
@@ -519,6 +525,7 @@ def __init__(self, *args: Any, **kwargs: Any):
519525
self.daemon_cache_timeout = 0
520526
self.facts_cache_timeout = 0
521527
self.host_check_interval = 0
528+
self.stray_daemon_check_interval = 0
522529
self.max_count_per_host = 0
523530
self.mode = ''
524531
self.container_image_base = ''
@@ -693,6 +700,8 @@ def __init__(self, *args: Any, **kwargs: Any):
693700

694701
self.ceph_volume: CephVolume = CephVolume(self)
695702

703+
self.last_stray_daemon_check: Optional[datetime.datetime] = None
704+
696705
def shutdown(self) -> None:
697706
self.log.debug('shutdown')
698707
self._worker_pool.close()

src/pybind/mgr/cephadm/serve.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from datetime import datetime
1+
import datetime
22
import ipaddress
33
import hashlib
44
import json
@@ -64,7 +64,7 @@ class CephadmServe:
6464
def __init__(self, mgr: "CephadmOrchestrator"):
6565
self.mgr: "CephadmOrchestrator" = mgr
6666
self.log = logger
67-
self.last_certificates_check: Optional[datetime] = None
67+
self.last_certificates_check: Optional[datetime.datetime] = None
6868

6969
def serve(self) -> None:
7070
"""
@@ -173,6 +173,7 @@ def _serve_sleep(self) -> None:
173173
self.mgr.facts_cache_timeout,
174174
self.mgr.daemon_cache_timeout,
175175
self.mgr.device_cache_timeout,
176+
self.mgr.stray_daemon_check_interval,
176177
)
177178
)
178179
self.log.debug('Sleeping for %d seconds', sleep_interval)
@@ -466,6 +467,9 @@ def _run_async_actions(self) -> None:
466467
(self.mgr.scheduled_async_actions.pop(0))()
467468

468469
def _check_for_strays(self) -> None:
470+
cutoff = datetime_now() - datetime.timedelta(seconds=self.mgr.stray_daemon_check_interval)
471+
if self.mgr.last_stray_daemon_check is not None and self.mgr.last_stray_daemon_check >= cutoff:
472+
return
469473
self.log.debug('_check_for_strays')
470474
for k in ['CEPHADM_STRAY_HOST',
471475
'CEPHADM_STRAY_DAEMON']:
@@ -516,6 +520,7 @@ def _check_for_strays(self) -> None:
516520
if self.mgr.warn_on_stray_daemons and daemon_detail:
517521
self.mgr.set_health_warning(
518522
'CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by cephadm', len(daemon_detail), daemon_detail)
523+
self.mgr.last_stray_daemon_check = datetime_now()
519524

520525
def _service_reference_name(self, service_type: str, daemon_id: str) -> str:
521526
if service_type not in ['rbd-mirror', 'cephfs-mirror', 'rgw', 'rgw-nfs']:

src/pybind/mgr/cephadm/service_discovery.py

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@ class Server: # type: ignore
99
import logging
1010

1111
import orchestrator # noqa
12-
from mgr_module import ServiceInfoT
1312
from mgr_util import build_url
1413
from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO
1514
from cephadm.services.nfs import NFSService
@@ -184,17 +183,16 @@ def get_sd_config(self, service: str) -> List[Dict[str, Collection[str]]]:
184183
return []
185184

186185
def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]:
187-
"""Return <http_sd_config> compatible prometheus config for prometheus service."""
188-
servers = self.mgr.list_servers()
186+
"""Return <http_sd_config> compatible prometheus config for prometheus service.
187+
Targets should be a length one list containing only the active mgr
188+
"""
189189
targets = []
190-
for server in servers:
191-
hostname = server.get('hostname', '')
192-
for service in cast(List[ServiceInfoT], server.get('services', [])):
193-
if service['type'] != 'mgr' or service['id'] != self.mgr.get_mgr_id():
194-
continue
195-
port = self.mgr.get_module_option_ex(
196-
'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)
197-
targets.append(f'{hostname}:{port}')
190+
mgr_daemons = self.mgr.cache.get_daemons_by_service('mgr')
191+
host = service_registry.get_service('mgr').get_active_daemon(mgr_daemons).hostname or ''
192+
fqdn = self.mgr.get_fqdn(host)
193+
port = self.mgr.get_module_option_ex(
194+
'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)
195+
targets.append(f'{fqdn}:{port}')
198196
return [{"targets": targets, "labels": {}}]
199197

200198
def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]:

src/pybind/mgr/cephadm/tests/test_service_discovery.py

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,13 @@
44

55

66
class FakeDaemonDescription:
7-
def __init__(self, ip, ports, hostname, service_name='', daemon_type=''):
7+
def __init__(self, ip, ports, hostname, service_name='', daemon_type='', daemon_id=''):
88
self.ip = ip
99
self.ports = ports
1010
self.hostname = hostname
1111
self._service_name = service_name
1212
self.daemon_type = daemon_type
13+
self.daemon_id = daemon_id if daemon_id else hostname
1314

1415
def service_name(self):
1516
return self._service_name
@@ -36,6 +37,10 @@ def get_daemons_by_service(self, service_type):
3637
return [FakeDaemonDescription('1.2.3.4', [9123], 'node0'),
3738
FakeDaemonDescription('1.2.3.5', [9123], 'node1')]
3839

40+
if service_type == 'mgr':
41+
return [FakeDaemonDescription('1.2.3.4', [9922], 'node0', daemon_type='mgr', daemon_id='fake_active_mgr'),
42+
FakeDaemonDescription('1.2.3.5', [9922], 'node1', daemon_type='mgr', daemon_id='fake_standby_mgr')]
43+
3944
return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'),
4045
FakeDaemonDescription('1.2.3.5', [9200], 'node1')]
4146

@@ -111,19 +116,6 @@ def __init__(self):
111116
def get_mgr_id(self):
112117
return 'mgr-1'
113118

114-
def list_servers(self):
115-
116-
servers = [
117-
{'hostname': 'node0',
118-
'ceph_version': '16.2',
119-
'services': [{'type': 'mgr', 'id': 'mgr-1'}, {'type': 'mon'}]},
120-
{'hostname': 'node1',
121-
'ceph_version': '16.2',
122-
'services': [{'type': 'mgr', 'id': 'mgr-2'}, {'type': 'mon'}]}
123-
]
124-
125-
return servers
126-
127119
def _check_mon_command(self, cmd_dict, inbuf=None):
128120
prefix = cmd_dict.get('prefix')
129121
if prefix == 'get-cmd':
@@ -136,6 +128,14 @@ def _check_mon_command(self, cmd_dict, inbuf=None):
136128
def get_module_option_ex(self, module, option, default_value):
137129
return "9283"
138130

131+
def daemon_is_self(self, d_type, d_id) -> bool:
132+
if d_type == 'mgr' and d_id == 'fake_active_mgr':
133+
return True
134+
return False
135+
136+
def get_fqdn(self, hostname: str) -> str:
137+
return hostname
138+
139139

140140
class TestServiceDiscovery:
141141

0 commit comments

Comments
 (0)