Skip to content

Commit 57ed40f

Browse files
authored
Fix unhandled DCSError during startup phase (patroni#3149)
Ensure DCS connectivity before we check node uniqueness or load dynamic configuration.
1 parent d5d6a51 commit 57ed40f

File tree

2 files changed

+69
-43
lines changed

2 files changed

+69
-43
lines changed

patroni/__main__.py

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
if TYPE_CHECKING: # pragma: no cover
2020
from .config import Config
21+
from .dcs import Cluster
2122

2223
logger = logging.getLogger(__name__)
2324

@@ -63,10 +64,11 @@ def __init__(self, config: 'Config') -> None:
6364
self.dcs = get_dcs(self.config)
6465
self.request = PatroniRequest(self.config, True)
6566

66-
self.ensure_unique_name()
67+
cluster = self.ensure_dcs_access()
68+
self.ensure_unique_name(cluster)
6769

6870
self.watchdog = Watchdog(self.config)
69-
self.load_dynamic_configuration()
71+
self.apply_dynamic_configuration(cluster)
7072

7173
self.postgresql = Postgresql(self.config['postgresql'], self.dcs.mpp)
7274
self.api = RestApiServer(self, self.config['restapi'])
@@ -76,40 +78,49 @@ def __init__(self, config: 'Config') -> None:
7678
self.next_run = time.time()
7779
self.scheduled_restart: Dict[str, Any] = {}
7880

79-
def load_dynamic_configuration(self) -> None:
80-
"""Load Patroni dynamic configuration.
81+
def ensure_dcs_access(self, sleep_time: int = 5) -> 'Cluster':
82+
"""Continuously attempt to retrieve cluster from DCS with delay.
8183
82-
Load dynamic configuration from the DCS, if `/config` key is available in the DCS, otherwise fall back to
83-
``bootstrap.dcs`` section from the configuration file.
84-
85-
If the DCS connection fails returning the exception :class:`~patroni.exceptions.DCSError` an attempt will be
86-
remade every 5 seconds.
84+
:param sleep_time: seconds to wait between retry attempts after dcs connection raise :exc:`DCSError`.
8785
88-
.. note::
89-
This method is called only once, at the time when Patroni is started.
86+
:returns: a PostgreSQL or MPP implementation of :class:`Cluster`.
9087
"""
9188
from patroni.exceptions import DCSError
89+
9290
while True:
9391
try:
94-
cluster = self.dcs.get_cluster()
95-
if cluster and cluster.config and cluster.config.data:
96-
if self.config.set_dynamic_configuration(cluster.config):
97-
self.dcs.reload_config(self.config)
98-
self.watchdog.reload_config(self.config)
99-
elif not self.config.dynamic_configuration and 'bootstrap' in self.config:
100-
if self.config.set_dynamic_configuration(self.config['bootstrap']['dcs']):
101-
self.dcs.reload_config(self.config)
102-
self.watchdog.reload_config(self.config)
103-
break
92+
return self.dcs.get_cluster()
10493
except DCSError:
10594
logger.warning('Can not get cluster from dcs')
106-
time.sleep(5)
95+
time.sleep(sleep_time)
96+
97+
def apply_dynamic_configuration(self, cluster: 'Cluster') -> None:
98+
"""Apply Patroni dynamic configuration.
10799
108-
def ensure_unique_name(self) -> None:
109-
"""A helper method to prevent splitbrain from operator naming error."""
100+
Apply dynamic configuration from the DCS, if `/config` key is available in the DCS, otherwise fall back to
101+
``bootstrap.dcs`` section from the configuration file.
102+
103+
.. note::
104+
This method is called only once, at the time when Patroni is started.
105+
106+
:param cluster: a PostgreSQL or MPP implementation of :class:`Cluster`.
107+
"""
108+
if cluster and cluster.config and cluster.config.data:
109+
if self.config.set_dynamic_configuration(cluster.config):
110+
self.dcs.reload_config(self.config)
111+
self.watchdog.reload_config(self.config)
112+
elif not self.config.dynamic_configuration and 'bootstrap' in self.config:
113+
if self.config.set_dynamic_configuration(self.config['bootstrap']['dcs']):
114+
self.dcs.reload_config(self.config)
115+
self.watchdog.reload_config(self.config)
116+
117+
def ensure_unique_name(self, cluster: 'Cluster') -> None:
118+
"""A helper method to prevent splitbrain from operator naming error.
119+
120+
:param cluster: a PostgreSQL or MPP implementation of :class:`Cluster`.
121+
"""
110122
from patroni.dcs import Member
111123

112-
cluster = self.dcs.get_cluster()
113124
if not cluster:
114125
return
115126
member = cluster.get_member(self.config['name'], False)

tests/test_patroni.py

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from patroni.__main__ import check_psycopg, main as _main, Patroni
1616
from patroni.api import RestApiServer
1717
from patroni.async_executor import AsyncExecutor
18-
from patroni.dcs import Cluster, Member
18+
from patroni.dcs import Cluster, ClusterConfig, Member
1919
from patroni.dcs.etcd import AbstractEtcdClientWithFailover
2020
from patroni.exceptions import DCSError
2121
from patroni.postgresql import Postgresql
@@ -90,11 +90,22 @@ def setUp(self):
9090
def tearDown(self):
9191
logging.getLogger().handlers[:] = self._handlers
9292

93-
@patch('patroni.dcs.AbstractDCS.get_cluster', Mock(side_effect=[None, DCSError('foo'), None]))
94-
def test_load_dynamic_configuration(self):
93+
def test_apply_dynamic_configuration(self):
94+
empty_cluster = Cluster.empty()
95+
self.p.config._dynamic_configuration = {}
96+
self.p.apply_dynamic_configuration(empty_cluster)
97+
self.assertEqual(self.p.config._dynamic_configuration['ttl'], 30)
98+
99+
without_config = empty_cluster._asdict()
100+
del without_config['config']
101+
cluster = Cluster(
102+
config=ClusterConfig(version=1, modify_version=1, data={"ttl": 40}),
103+
**without_config
104+
)
95105
self.p.config._dynamic_configuration = {}
96-
self.p.load_dynamic_configuration()
97-
self.p.load_dynamic_configuration()
106+
self.p.apply_dynamic_configuration(cluster)
107+
self.assertEqual(self.p.config._dynamic_configuration['ttl'], 40)
108+
98109

99110
@patch('sys.argv', ['patroni.py', 'postgres0.yml'])
100111
@patch('time.sleep', Mock(side_effect=SleepException))
@@ -276,11 +287,9 @@ def test_check_psycopg(self):
276287

277288
def test_ensure_unique_name(self):
278289
# None/empty cluster implies unique name
279-
with patch('patroni.dcs.AbstractDCS.get_cluster', Mock(return_value=None)):
280-
self.assertIsNone(self.p.ensure_unique_name())
290+
self.assertIsNone(self.p.ensure_unique_name(None))
281291
empty_cluster = Cluster.empty()
282-
with patch('patroni.dcs.AbstractDCS.get_cluster', Mock(return_value=empty_cluster)):
283-
self.assertIsNone(self.p.ensure_unique_name())
292+
self.assertIsNone(self.p.ensure_unique_name(empty_cluster))
284293
without_members = empty_cluster._asdict()
285294
del without_members['members']
286295

@@ -289,8 +298,7 @@ def test_ensure_unique_name(self):
289298
members=[Member(version=1, name="distinct", session=1, data={})],
290299
**without_members
291300
)
292-
with patch('patroni.dcs.AbstractDCS.get_cluster', Mock(return_value=okay_cluster)):
293-
self.assertIsNone(self.p.ensure_unique_name())
301+
self.assertIsNone(self.p.ensure_unique_name(okay_cluster))
294302

295303
# Cluster with a member with the same name that is running
296304
bad_cluster = Cluster(
@@ -299,10 +307,17 @@ def test_ensure_unique_name(self):
299307
})],
300308
**without_members
301309
)
302-
with patch('patroni.dcs.AbstractDCS.get_cluster', Mock(return_value=bad_cluster)):
303-
# If the api of the running node cannot be reached, this implies unique name
304-
with patch('urllib3.PoolManager.request', Mock(side_effect=ConnectionError)):
305-
self.assertIsNone(self.p.ensure_unique_name())
306-
# Only if the api of the running node is reachable do we throw an error
307-
with patch('urllib3.PoolManager.request', Mock()):
308-
self.assertRaises(SystemExit, self.p.ensure_unique_name)
310+
# If the api of the running node cannot be reached, this implies unique name
311+
with patch('urllib3.PoolManager.request', Mock(side_effect=ConnectionError)):
312+
self.assertIsNone(self.p.ensure_unique_name(bad_cluster))
313+
# Only if the api of the running node is reachable do we throw an error
314+
with patch('urllib3.PoolManager.request', Mock()):
315+
self.assertRaises(SystemExit, self.p.ensure_unique_name, bad_cluster)
316+
317+
@patch('patroni.dcs.AbstractDCS.get_cluster', Mock(side_effect=[DCSError('foo'), DCSError('foo'), None]))
318+
def test_ensure_dcs_access(self):
319+
with patch('patroni.__main__.logger.warning') as mock_logger:
320+
result = self.p.ensure_dcs_access()
321+
self.assertEqual(result, None)
322+
self.assertEqual(mock_logger.call_count, 2)
323+

0 commit comments

Comments
 (0)