Skip to content

Commit 0567d36

Browse files
authored
Avoid running demote('offline') concurrently (patroni#3372)
In case of a slow shutdown, the next heartbeat loop could hit the _handle_dcs_error() method again, which resulted in the `AsyncExecutor is busy, demoting from the main thread` warning and in demote('offline') being started a second time. The demotion is now always called directly from _handle_dcs_error() instead of being handed to the AsyncExecutor.
1 parent a79a69a commit 0567d36

File tree

2 files changed

+3
-8
lines changed

2 files changed

+3
-8
lines changed

patroni/ha.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2283,10 +2283,7 @@ def _handle_dcs_error(self) -> str:
22832283
self._sync_replication_slots(True)
22842284
return 'continue to run as a leader because failsafe mode is enabled and all members are accessible'
22852285
self._failsafe.set_is_active(0)
2286-
msg = 'demoting self because DCS is not accessible and I was a leader'
2287-
if not self._async_executor.try_run_async(msg, self.demote, ('offline',)):
2288-
return msg
2289-
logger.warning('AsyncExecutor is busy, demoting from the main thread')
2286+
logger.info('demoting self because DCS is not accessible and I was a leader')
22902287
self.demote('offline')
22912288
return 'demoted self because DCS is not accessible and I was a leader'
22922289
else:

tests/test_ha.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -513,24 +513,22 @@ def test_follow_triggers_rewind(self):
513513

514514
def test_no_dcs_connection_primary_demote(self):
515515
self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly'))
516-
self.assertEqual(self.ha.run_cycle(), 'demoting self because DCS is not accessible and I was a leader')
517-
self.ha._async_executor.schedule('dummy')
518516
self.assertEqual(self.ha.run_cycle(), 'demoted self because DCS is not accessible and I was a leader')
519517

520518
def test_check_failsafe_topology(self):
521519
self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly'))
522520
self.ha.cluster = get_cluster_initialized_with_leader_and_failsafe()
523521
global_config.update(self.ha.cluster)
524522
self.ha.dcs._last_failsafe = self.ha.cluster.failsafe
525-
self.assertEqual(self.ha.run_cycle(), 'demoting self because DCS is not accessible and I was a leader')
523+
self.assertEqual(self.ha.run_cycle(), 'demoted self because DCS is not accessible and I was a leader')
526524
self.ha.state_handler.name = self.ha.cluster.leader.name
527525
self.assertFalse(self.ha.failsafe_is_active())
528526
self.assertEqual(self.ha.run_cycle(),
529527
'continue to run as a leader because failsafe mode is enabled and all members are accessible')
530528
self.assertTrue(self.ha.failsafe_is_active())
531529
with patch.object(Postgresql, 'slots', Mock(side_effect=Exception)):
532530
self.ha.patroni.request = Mock(side_effect=Exception)
533-
self.assertEqual(self.ha.run_cycle(), 'demoting self because DCS is not accessible and I was a leader')
531+
self.assertEqual(self.ha.run_cycle(), 'demoted self because DCS is not accessible and I was a leader')
534532
self.assertFalse(self.ha.failsafe_is_active())
535533
self.ha.dcs._last_failsafe.clear()
536534
self.ha.dcs._last_failsafe[self.ha.cluster.leader.name] = self.ha.cluster.leader.member.api_url

0 commit comments

Comments
 (0)