Skip to content

Commit 64f590c

Browse files
committed
mgr/prometheus: extend tls config to work with Rook orch
This commit extends the TLS support that was implemented for cephadm so that it also works with the Rook orchestrator. Certificates are read from the Rook namespace via the Kubernetes API client. The name of the secret is provided by the following parameter: prometheus_tls_secret_name (default secret name "rook-ceph-prometheus-server-tls"). When cephadm is used, it generates self-signed certificates; when Rook is used, the certificates are read through the Kubernetes client API from rook_env.namespace. Signed-off-by: Sunnatillo <[email protected]>
1 parent bda2ad5 commit 64f590c

File tree

2 files changed

+109
-12
lines changed

2 files changed

+109
-12
lines changed

src/pybind/mgr/prometheus/module.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import time
1010
import enum
1111
from collections import namedtuple
12-
import tempfile
12+
from tempfile import NamedTemporaryFile
1313

1414
from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
1515
from mgr_util import get_default_addr, profile_method, build_url
@@ -1767,15 +1767,12 @@ def configure(self, server_addr: str, server_port: int) -> None:
17671767
try:
17681768
security_config = json.loads(out)
17691769
if security_config.get('security_enabled', False):
1770-
self.setup_tls_using_cephadm(server_addr, server_port)
1770+
self.setup_tls_config(server_addr, server_port)
17711771
return
17721772
except Exception as e:
17731773
self.log.exception(f'Failed to setup cephadm based secure monitoring stack: {e}\n',
17741774
'Falling back to default configuration')
17751775

1776-
# In any error fallback to plain http mode
1777-
self.setup_default_config(server_addr, server_port)
1778-
17791776
def setup_default_config(self, server_addr: str, server_port: int) -> None:
17801777
cherrypy.config.update({
17811778
'server.socket_host': server_addr,
@@ -1789,7 +1786,10 @@ def setup_default_config(self, server_addr: str, server_port: int) -> None:
17891786
self.set_uri(build_url(scheme='http', host=self.get_server_addr(),
17901787
port=server_port, path='/'))
17911788

1792-
def setup_tls_using_cephadm(self, server_addr: str, server_port: int) -> None:
1789+
def setup_tls_config(self, server_addr: str, server_port: int) -> None:
1790+
# Temporarily disabling the verify function due to issues.
1791+
# Please check verify_tls_files below for more information.
1792+
# from mgr_util import verify_tls_files
17931793
cmd = {'prefix': 'orch certmgr generate-certificates',
17941794
'module_name': 'prometheus',
17951795
'format': 'json'}
@@ -1802,13 +1802,17 @@ def setup_tls_using_cephadm(self, server_addr: str, server_port: int) -> None:
18021802
return
18031803

18041804
cert_key = json.loads(out)
1805-
self.cert_file = tempfile.NamedTemporaryFile()
1805+
self.cert_file = NamedTemporaryFile()
18061806
self.cert_file.write(cert_key['cert'].encode('utf-8'))
18071807
self.cert_file.flush() # cert_tmp must not be gc'ed
1808-
self.key_file = tempfile.NamedTemporaryFile()
1808+
self.key_file = NamedTemporaryFile()
18091809
self.key_file.write(cert_key['key'].encode('utf-8'))
18101810
self.key_file.flush() # pkey_tmp must not be gc'ed
18111811

1812+
# Temporarily disabling the verify function due to issues:
1813+
# See https://github.com/pyca/bcrypt/issues/694 for details.
1814+
# Re-enable once the issue is resolved.
1815+
# verify_tls_files(self.cert_file.name, self.key_file.name)
18121816
cert_file_path, key_file_path = self.cert_file.name, self.key_file.name
18131817

18141818
cherrypy.config.update({

src/pybind/mgr/rook/module.py

Lines changed: 97 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@
55
import functools
66
import os
77
import json
8+
import base64
9+
import time
10+
from typing import Optional, Dict, Union, Tuple, Type, Optional
11+
from functools import wraps
812

913
from ceph.deployment import inventory
1014
from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec, RGWSpec, PlacementSpec
1115
from ceph.utils import datetime_now
1216

13-
from typing import List, Dict, Optional, Callable, Any, TypeVar, Tuple, TYPE_CHECKING
17+
from typing import List, Dict, Optional, Callable, Any, TypeVar, Tuple, TYPE_CHECKING, cast
1418

1519
try:
1620
from ceph.deployment.drive_group import DriveGroupSpec
@@ -19,7 +23,7 @@
1923

2024
try:
2125
from kubernetes import client, config
22-
from kubernetes.client.rest import ApiException
26+
from kubernetes.client import ApiException, CoreV1Api, V1Secret
2327

2428
kubernetes_imported = True
2529

@@ -33,6 +37,9 @@ def names(self: Any, names: Any) -> None:
3337
kubernetes_imported = False
3438
client = None
3539
config = None
40+
ApiException = Exception
41+
CoreV1Api = None
42+
V1Secret = object
3643

3744
from mgr_module import MgrModule, Option, NFS_POOL_NAME
3845
import orchestrator
@@ -44,6 +51,34 @@ def names(self: Any, names: Any) -> None:
4451
FuncT = TypeVar('FuncT', bound=Callable)
4552
ServiceSpecT = TypeVar('ServiceSpecT', bound=ServiceSpec)
4653

54+
def retry(
    on_exception: Union[Type[Exception], Tuple[Type[Exception], ...]],
    tries: int = 3,
    delay: int = 1,
    backoff: int = 2,
    max_delay: int = 60,
    logger: Optional[logging.Logger] = None,
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Build a decorator that re-invokes a function when it raises.

    The wrapped function is called up to ``tries`` times. Between two
    attempts the wrapper sleeps for ``min(wait, max_delay)`` seconds, where
    ``wait`` starts at ``delay`` and is multiplied by ``backoff`` after
    every failed attempt. Exceptions that do not match ``on_exception``
    propagate immediately without any retry.

    :param on_exception: exception class (or tuple of classes) that
        triggers a retry
    :param tries: maximum number of calls, must be at least 1
    :param delay: initial sleep between attempts, in seconds
    :param backoff: multiplier applied to the sleep after each failure
    :param max_delay: upper bound for a single sleep, in seconds
    :param logger: optional logger that receives one warning per failure
    :raises ValueError: if ``tries`` is smaller than 1
    :return: a decorator; the decorated function re-raises the exception
        of the last attempt once all tries are exhausted
    """
    # With tries < 1 the loop below would never call the function and there
    # would be no exception to re-raise, so reject the value right away.
    if tries < 1:
        raise ValueError(f"tries must be >= 1, got {tries}")

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            wait = delay
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except on_exception as e:
                    if logger:
                        logger.warning(
                            "Retry #%d/%d after exception in '%s': %s",
                            attempt, tries, func.__name__, e
                        )
                    if attempt == tries:
                        # Out of attempts: surface the last failure with
                        # its original traceback.
                        raise
                    time.sleep(min(wait, max_delay))
                    wait *= backoff
        return wrapper
    return decorator
4782

4883
class RookEnv(object):
4984
def __init__(self) -> None:
@@ -82,6 +117,18 @@ class RookOrchestrator(MgrModule, orchestrator.Orchestrator):
82117
default='local',
83118
desc='storage class name for LSO-discovered PVs',
84119
),
120+
Option(
121+
'secure_monitoring_stack',
122+
type='bool',
123+
default=False,
124+
desc='Enable TLS security for all the monitoring stack daemons'
125+
),
126+
Option(
127+
'prometheus_tls_secret_name',
128+
type='str',
129+
default='rook-ceph-prometheus-server-tls',
130+
desc='name of tls secret in k8s for prometheus',
131+
)
85132
]
86133

87134
@staticmethod
@@ -533,7 +580,13 @@ def _get_pool_params(self) -> Tuple[int, str]:
533580

534581
@handle_orch_error
def get_security_config(self) -> Dict[str, bool]:
    """Return the security settings of the monitoring stack.

    ``security_enabled`` mirrors the ``secure_monitoring_stack`` option of
    the ``rook`` module (``False`` when unset); ``mgmt_gw_enabled`` is
    always reported as ``False``.
    """
    tls_enabled = cast(
        bool,
        self.get_module_option_ex('rook', 'secure_monitoring_stack', False),
    )
    security_config = {'security_enabled': tls_enabled}
    security_config['mgmt_gw_enabled'] = False
    return security_config
537590

538591
@handle_orch_error
539592
def remove_service(self, service_name: str, force: bool = False) -> str:
@@ -571,7 +624,7 @@ def zap_device(self, host: str, path: str) -> OrchResult[str]:
571624
except Exception as e:
572625
logging.error(e)
573626
return OrchResult(None, Exception("Unable to zap device: " + str(e.with_traceback(None))))
574-
return OrchResult(f'{path} on {host} zapped')
627+
return OrchResult(f'{path} on {host} zapped')
575628

576629
@handle_orch_error
577630
def apply_mon(self, spec):
@@ -643,3 +696,43 @@ def upgrade_status(self) -> orchestrator.UpgradeStatusSpec:
643696
@handle_orch_error
644697
def upgrade_ls(self, image: Optional[str], tags: bool, show_all_versions: Optional[bool]) -> Dict[Any, Any]:
645698
return {}
699+
700+
# Retry decorator for handling transient Kubernetes API failures
701+
@retry(on_exception=ApiException, tries=7, delay=1, backoff=2, max_delay=60)
def fetch_k8s_secret(self, secret_name: str) -> Optional[V1Secret]:
    """Read a secret from the Rook namespace through the CoreV1 API.

    :param secret_name: name of the Kubernetes secret to read
    :return: the secret object, or ``None`` when the API client is not
        initialized, when the API server answers with a non-transient
        client error, or when an unexpected non-API error occurs
    :raises ApiException: for API errors that may be transient; these are
        re-raised so that the ``retry`` decorator can repeat the call, and
        the last one propagates to the caller once the tries are exhausted
    """
    if self._k8s_CoreV1_api is None:
        logging.warning("CoreV1Api client is not initialized, returning None.")
        return None

    try:
        return self._k8s_CoreV1_api.read_namespaced_secret(
            name=secret_name,
            namespace=self._rook_env.namespace
        )
    except ApiException as e:
        logging.warning(f"Failed to fetch secret '{secret_name}': {e}")
        # ``status`` is looked up defensively because ApiException falls
        # back to plain ``Exception`` when the kubernetes import failed.
        status = getattr(e, 'status', None)
        if (isinstance(status, int) and 400 <= status < 500
                and status not in (408, 429)):
            # Client errors such as 404 or 403 are not going to change on
            # a retry, so report the secret as unavailable immediately.
            return None
        # Anything else must reach the retry decorator; swallowing it here
        # would make the decorator a no-op.
        raise
    except Exception as e:
        logging.warning(f"Failed to fetch secret '{secret_name}': {e}")
        return None
715+
716+
@handle_orch_error
def generate_certificates(self, module_name: str) -> Optional[Dict[str, str]]:
    """Return the TLS certificate and key stored in Kubernetes for a module.

    The secret name is taken from the ``<module_name>_tls_secret_name``
    module option; the secret is expected to carry base64-encoded
    ``tls.crt`` and ``tls.key`` entries.

    :param module_name: name of the requesting module; only ``prometheus``
        is supported
    :return: ``{'cert': <pem text>, 'key': <pem text>}``
    :raises orchestrator.OrchestratorError: if the module is unsupported,
        the secret cannot be fetched, or its content is missing or cannot
        be decoded
    """
    supported_modules = ['prometheus']
    if module_name not in supported_modules:
        raise orchestrator.OrchestratorError(f'Unsupported module {module_name}. Supported module are: {supported_modules}')

    secret_name = self.get_module_option(f'{module_name}_tls_secret_name')
    try:
        api_response = self.fetch_k8s_secret(secret_name)
    except ApiException as e:
        raise orchestrator.OrchestratorError(f'Unable to get certificates for {module_name}, error: {e}') from e

    if api_response is None:
        raise orchestrator.OrchestratorError(f'Unable to get certificates for {module_name}')

    # A secret without any entries has ``data`` set to None rather than {}.
    secret_data = api_response.data or {}
    try:
        cert = base64.b64decode(secret_data.get('tls.crt', '')).decode('utf-8')
        key = base64.b64decode(secret_data.get('tls.key', '')).decode('utf-8')
    except ValueError as e:
        # binascii.Error (bad base64) and UnicodeDecodeError (bad UTF-8)
        # are both ValueError subclasses.
        raise orchestrator.OrchestratorError(f'Unable to parse certificates for {module_name} module') from e

    if not cert or not key:
        raise orchestrator.OrchestratorError(f'Unable to parse certificates for {module_name} module')

    return {'cert': cert, 'key': key}

0 commit comments

Comments
 (0)