Skip to content

Commit 7dcb9b9

Browse files
Run on_role_change cb after a failed primary recovery (patroni#3198)
Additionally run the on_role_change callback in post_recover() for a primary that failed to start after a crash, to increase the chances that the callback is executed even if the subsequent start as a replica fails --------- Co-authored-by: Alexander Kukushkin <[email protected]>
1 parent e8a8bfe commit 7dcb9b9

File tree

4 files changed

+24
-1
lines changed

4 files changed

+24
-1
lines changed

features/recovery.feature

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,12 @@ Feature: recovery
2424
When I kill postmaster on postgres-0
2525
Then postgres-1 is a leader after 10 seconds
2626
And postgres-1 role is the primary after 10 seconds
27+
28+
Scenario: check crashed primary demotes after failed attempt to start
29+
Given I issue a PATCH request to http://127.0.0.1:8009/config with {"master_start_timeout": null}
30+
Then I receive a response code 200
31+
And postgres-0 role is the replica after 10 seconds
32+
When I ensure postgres-1 fails to start after a failure
33+
When I kill postmaster on postgres-1
34+
Then postgres-0 is a leader after 10 seconds
35+
And there is a postgres-1_cb.log with "on_role_change demoted batman" in postgres-1 data directory

features/steps/recovery.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import os
2+
3+
from behave import step
4+
5+
6+
@step('I ensure {name:name} fails to start after a failure')
def spoil_autoconf(context, name):
    """Overwrite ``postgresql.auto.conf`` of member *name* with ``foo=bar``.

    Per the step text, the intent is that the member fails on its next start
    attempt; the file is truncated and replaced with that single setting.

    :param context: behave context; its ``pctl._processes`` mapping is indexed by *name*.
    :param name: name of the member whose data directory is modified.
    """
    data_dir = context.pctl._processes[name]._data_dir
    autoconf_path = os.path.join(data_dir, 'postgresql.auto.conf')
    with open(autoconf_path, 'w') as autoconf:
        autoconf.write('foo=bar')

patroni/ha.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,7 @@ def post_recover(self) -> Optional[str]:
19641964
if self.has_lock():
19651965
if self.state_handler.role in ('primary', 'standby_leader'):
19661966
self.state_handler.set_role('demoted')
1967+
self.state_handler.call_nowait(CallbackAction.ON_ROLE_CHANGE)
19671968
self._delete_leader()
19681969
return 'removed leader key after trying and failing to start postgres'
19691970
return 'failed to start postgres'

tests/test_ha.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from patroni.ha import _MemberStatus, Ha
1616
from patroni.postgresql import Postgresql
1717
from patroni.postgresql.bootstrap import Bootstrap
18+
from patroni.postgresql.callback_executor import CallbackAction
1819
from patroni.postgresql.cancellable import CancellableSubprocess
1920
from patroni.postgresql.config import ConfigHandler
2021
from patroni.postgresql.postmaster import PostmasterProcess
@@ -1105,11 +1106,14 @@ def test_fetch_node_status(self):
11051106
@patch.object(Rewind, 'check_leader_is_not_in_recovery', true)
11061107
@patch('os.listdir', Mock(return_value=[]))
11071108
@patch('patroni.postgresql.rewind.fsync_dir', Mock())
1108-
def test_post_recover(self):
1109+
@patch.object(Postgresql, 'call_nowait')
1110+
def test_post_recover(self, mock_call_nowait):
11091111
self.p.is_running = false
11101112
self.ha.has_lock = true
11111113
self.p.set_role('primary')
11121114
self.assertEqual(self.ha.post_recover(), 'removed leader key after trying and failing to start postgres')
1115+
self.assertEqual(self.p.role, 'demoted')
1116+
mock_call_nowait.assert_called_once_with(CallbackAction.ON_ROLE_CHANGE)
11131117
self.ha.has_lock = false
11141118
self.assertEqual(self.ha.post_recover(), 'failed to start postgres')
11151119
leader = Leader(0, 0, Member(0, 'l', 2, {"version": "1.6", "conn_url": "postgres://a", "role": "primary"}))

0 commit comments

Comments
 (0)