Skip to content

Commit 2cbd2cd

Browse files
authored
Merge pull request ceph#61468 from Nordix/fix-69610-sunnat
mgr/prometheus: Make prometheus TLS config work with Rook orchestrator Reviewed-by: Redouane Kachach <[email protected]>
2 parents e2ee420 + 64f590c commit 2cbd2cd

File tree

2 files changed

+119
-24
lines changed

2 files changed

+119
-24
lines changed

src/pybind/mgr/prometheus/module.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import time
1010
import enum
1111
from collections import namedtuple
12-
import tempfile
12+
from tempfile import NamedTemporaryFile
1313

1414
from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
1515
from mgr_util import get_default_addr, profile_method, build_url
@@ -1761,24 +1761,17 @@ def self_test(self) -> None:
17611761
self.get_file_sd_config()
17621762

17631763
def configure(self, server_addr: str, server_port: int) -> None:
    """Configure the HTTP(S) endpoint served by this module.

    The orchestrator is asked for its security configuration through the
    ``orch get-security-config`` mon command. If the reply parses as JSON
    and reports ``security_enabled``, TLS is set up via
    ``setup_tls_config``. In every other case (command failure, missing
    or malformed reply, security disabled, or an exception raised while
    setting up TLS) the default configuration is applied instead, so the
    endpoint is always configured when this method returns.

    :param server_addr: address the server should bind to
    :param server_port: port the server should listen on
    """
    cmd = {'prefix': 'orch get-security-config'}
    ret, out, _ = self.mon_command(cmd)
    if ret == 0 and out is not None:
        try:
            security_config = json.loads(out)
            if security_config.get('security_enabled', False):
                self.setup_tls_config(server_addr, server_port)
                return
        except Exception as e:
            # A single format string with one lazy argument: passing a
            # second positional string without a matching placeholder
            # makes the logging machinery fail while formatting the record.
            self.log.exception(
                'Failed to setup cephadm based secure monitoring stack: %s\n'
                'Falling back to default configuration', e)

    # In any error (or when security is disabled) fall back to plain http mode
    self.setup_default_config(server_addr, server_port)
17821775

17831776
def setup_default_config(self, server_addr: str, server_port: int) -> None:
17841777
cherrypy.config.update({
@@ -1793,8 +1786,10 @@ def setup_default_config(self, server_addr: str, server_port: int) -> None:
17931786
self.set_uri(build_url(scheme='http', host=self.get_server_addr(),
17941787
port=server_port, path='/'))
17951788

1796-
def setup_tls_using_cephadm(self, server_addr: str, server_port: int) -> None:
1797-
from mgr_util import verify_tls_files
1789+
def setup_tls_config(self, server_addr: str, server_port: int) -> None:
1790+
# Temporarily disabling the verify function due to issues.
1791+
# Please check verify_tls_files below for more information.
1792+
# from mgr_util import verify_tls_files
17981793
cmd = {'prefix': 'orch certmgr generate-certificates',
17991794
'module_name': 'prometheus',
18001795
'format': 'json'}
@@ -1807,14 +1802,17 @@ def setup_tls_using_cephadm(self, server_addr: str, server_port: int) -> None:
18071802
return
18081803

18091804
cert_key = json.loads(out)
1810-
self.cert_file = tempfile.NamedTemporaryFile()
1805+
self.cert_file = NamedTemporaryFile()
18111806
self.cert_file.write(cert_key['cert'].encode('utf-8'))
18121807
self.cert_file.flush() # cert_tmp must not be gc'ed
1813-
self.key_file = tempfile.NamedTemporaryFile()
1808+
self.key_file = NamedTemporaryFile()
18141809
self.key_file.write(cert_key['key'].encode('utf-8'))
18151810
self.key_file.flush() # pkey_tmp must not be gc'ed
18161811

1817-
verify_tls_files(self.cert_file.name, self.key_file.name)
1812+
# Temporarily disabling the verify function due to issues:
1813+
# See https://github.com/pyca/bcrypt/issues/694 for details.
1814+
# Re-enable once the issue is resolved.
1815+
# verify_tls_files(self.cert_file.name, self.key_file.name)
18181816
cert_file_path, key_file_path = self.cert_file.name, self.key_file.name
18191817

18201818
cherrypy.config.update({

src/pybind/mgr/rook/module.py

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@
55
import functools
66
import os
77
import json
8+
import base64
9+
import time
10+
from typing import Optional, Dict, Union, Tuple, Type, Optional
11+
from functools import wraps
812

913
from ceph.deployment import inventory
1014
from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec, RGWSpec, PlacementSpec
1115
from ceph.utils import datetime_now
1216

13-
from typing import List, Dict, Optional, Callable, Any, TypeVar, Tuple, TYPE_CHECKING
17+
from typing import List, Dict, Optional, Callable, Any, TypeVar, Tuple, TYPE_CHECKING, cast
1418

1519
try:
1620
from ceph.deployment.drive_group import DriveGroupSpec
@@ -19,7 +23,7 @@
1923

2024
try:
2125
from kubernetes import client, config
22-
from kubernetes.client.rest import ApiException
26+
from kubernetes.client import ApiException, CoreV1Api, V1Secret
2327

2428
kubernetes_imported = True
2529

@@ -33,6 +37,9 @@ def names(self: Any, names: Any) -> None:
3337
kubernetes_imported = False
3438
client = None
3539
config = None
40+
ApiException = Exception
41+
CoreV1Api = None
42+
V1Secret = object
3643

3744
from mgr_module import MgrModule, Option, NFS_POOL_NAME
3845
import orchestrator
@@ -44,6 +51,34 @@ def names(self: Any, names: Any) -> None:
4451
FuncT = TypeVar('FuncT', bound=Callable)
4552
ServiceSpecT = TypeVar('ServiceSpecT', bound=ServiceSpec)
4653

54+
def retry(
    on_exception: Union[Type[Exception], Tuple[Type[Exception], ...]],
    tries: int = 3,
    delay: int = 1,
    backoff: int = 2,
    max_delay: int = 60,
    logger: Optional[logging.Logger] = None,
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Build a decorator that retries a callable when it raises.

    The wrapped callable is invoked up to ``tries`` times. Whenever it
    raises ``on_exception`` the attempt is logged (if ``logger`` is given)
    and, unless it was the last attempt, the wrapper sleeps before trying
    again. The sleep starts at ``delay`` seconds, is multiplied by
    ``backoff`` after every failed attempt and is capped at ``max_delay``.
    Exceptions that do not match ``on_exception`` propagate immediately.

    :param on_exception: exception type (or tuple of types) that triggers a retry
    :param tries: total number of attempts, must be at least 1
    :param delay: initial sleep in seconds between attempts
    :param backoff: multiplier applied to the sleep after each failure
    :param max_delay: upper bound in seconds for a single sleep
    :param logger: optional logger that receives a warning per failed attempt
    :raises ValueError: if ``tries`` is smaller than 1
    :return: a decorator; the decorated callable re-raises the last
        matching exception once all attempts are exhausted
    """
    if tries < 1:
        # With no attempts there would be neither a result nor an exception
        # to re-raise, so reject the configuration up front.
        raise ValueError(f'tries must be at least 1, got {tries}')

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            wait = delay
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except on_exception as e:
                    if logger:
                        logger.warning(
                            "Retry #%d/%d after exception in '%s': %s",
                            attempt, tries, func.__name__, e
                        )
                    if attempt == tries:
                        # Out of attempts: surface the last failure unchanged.
                        raise
                    time.sleep(min(wait, max_delay))
                    wait *= backoff
            # The loop always returns or raises because tries >= 1.
            raise AssertionError('retry loop exited without a result')
        return wrapper
    return decorator
4782

4883
class RookEnv(object):
4984
def __init__(self) -> None:
@@ -82,6 +117,18 @@ class RookOrchestrator(MgrModule, orchestrator.Orchestrator):
82117
default='local',
83118
desc='storage class name for LSO-discovered PVs',
84119
),
120+
Option(
121+
'secure_monitoring_stack',
122+
type='bool',
123+
default=False,
124+
desc='Enable TLS security for all the monitoring stack daemons'
125+
),
126+
Option(
127+
'prometheus_tls_secret_name',
128+
type='str',
129+
default='rook-ceph-prometheus-server-tls',
130+
desc='name of tls secret in k8s for prometheus',
131+
)
85132
]
86133

87134
@staticmethod
@@ -531,6 +578,16 @@ def _get_pool_params(self) -> Tuple[int, str]:
531578
break
532579
return num_replicas, leaf_type
533580

581+
@handle_orch_error
def get_security_config(self) -> Dict[str, bool]:
    """Report the monitoring-stack security settings of this orchestrator.

    ``security_enabled`` mirrors the ``secure_monitoring_stack`` option of
    the ``rook`` module (read with a default of ``False``), while
    ``mgmt_gw_enabled`` is always reported as ``False``.
    """
    tls_enabled = cast(
        bool,
        self.get_module_option_ex('rook', 'secure_monitoring_stack', False),
    )
    # Keys are inserted in the same order as before so that any serialized
    # form of the result stays unchanged.
    return dict(security_enabled=tls_enabled, mgmt_gw_enabled=False)
590+
534591
@handle_orch_error
535592
def remove_service(self, service_name: str, force: bool = False) -> str:
536593
if service_name == 'rbd-mirror':
@@ -567,7 +624,7 @@ def zap_device(self, host: str, path: str) -> OrchResult[str]:
567624
except Exception as e:
568625
logging.error(e)
569626
return OrchResult(None, Exception("Unable to zap device: " + str(e.with_traceback(None))))
570-
return OrchResult(f'{path} on {host} zapped')
627+
return OrchResult(f'{path} on {host} zapped')
571628

572629
@handle_orch_error
573630
def apply_mon(self, spec):
@@ -639,3 +696,43 @@ def upgrade_status(self) -> orchestrator.UpgradeStatusSpec:
639696
@handle_orch_error
640697
def upgrade_ls(self, image: Optional[str], tags: bool, show_all_versions: Optional[bool]) -> Dict[Any, Any]:
641698
return {}
699+
700+
# Retry decorator for handling transient Kubernetes API failures
@retry(on_exception=ApiException, tries=7, delay=1, backoff=2, max_delay=60)
def fetch_k8s_secret(self, secret_name: str) -> Optional[V1Secret]:
    """Read a secret from the Rook namespace through the CoreV1 API.

    :param secret_name: name of the Kubernetes secret to read
    :return: the secret object, or ``None`` when the CoreV1 API client is
        not initialized, the secret does not exist (HTTP 404), or a
        non-API error occurs
    :raises ApiException: when the API keeps failing after all retries
    """
    if self._k8s_CoreV1_api is None:
        logging.warning("CoreV1Api client is not initialized, returning None.")
        return None

    try:
        return self._k8s_CoreV1_api.read_namespaced_secret(
            name=secret_name,
            namespace=self._rook_env.namespace
        )
    except ApiException as e:
        logging.warning(f"Failed to fetch secret '{secret_name}': {e}")
        # A missing secret is not a transient condition, so report it right
        # away instead of retrying. getattr() is used because ApiException
        # is aliased to plain Exception when the kubernetes package could
        # not be imported.
        if getattr(e, 'status', None) == 404:
            return None
        # Let the error reach the retry decorator; swallowing it here would
        # make the decorator a no-op.
        raise
    except Exception as e:
        logging.warning(f"Failed to fetch secret '{secret_name}': {e}")
        return None
715+
716+
@handle_orch_error
def generate_certificates(self, module_name: str) -> Optional[Dict[str, str]]:
    """Return the TLS certificate and key stored in Kubernetes for a module.

    The secret name is taken from the ``<module_name>_tls_secret_name``
    module option; its ``tls.crt`` and ``tls.key`` entries are
    base64-decoded and returned as text.

    :param module_name: name of the requesting module; only ``prometheus``
        is supported
    :return: dict with the PEM text under the ``cert`` and ``key`` keys
    :raises orchestrator.OrchestratorError: if the module is unsupported,
        the secret cannot be fetched, or its content is missing or cannot
        be decoded
    """
    supported_modules = ['prometheus']
    if module_name not in supported_modules:
        raise orchestrator.OrchestratorError(f'Unsupported module {module_name}. Supported module are: {supported_modules}')

    secret_name = self.get_module_option(f'{module_name}_tls_secret_name')
    try:
        api_response = self.fetch_k8s_secret(secret_name)
    except ApiException as e:
        raise orchestrator.OrchestratorError(f'Unable to get certificates for {module_name}, error: {e}') from e

    if api_response is None:
        raise orchestrator.OrchestratorError(f'Unable to get certificates for {module_name}')

    # A secret without any entries carries ``data=None``; treat it like an
    # empty mapping so it is reported as unparsable instead of crashing.
    secret_data = api_response.data or {}
    try:
        cert = base64.b64decode(secret_data.get('tls.crt', '')).decode('utf-8')
        key = base64.b64decode(secret_data.get('tls.key', '')).decode('utf-8')
    except (TypeError, ValueError) as e:
        # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
        raise orchestrator.OrchestratorError(f'Unable to parse certificates for {module_name} module') from e

    if not cert or not key:
        raise orchestrator.OrchestratorError(f'Unable to parse certificates for {module_name} module')

    return {'cert': cert, 'key': key}

0 commit comments

Comments
 (0)