@@ -8,7 +8,7 @@
 
 from multiprocessing.pool import ThreadPool
 from threading import RLock
-from typing import Any, Callable, Collection, Dict, List, NamedTuple, Optional, Tuple, TYPE_CHECKING, Union
+from typing import Any, Callable, cast, Collection, Dict, List, NamedTuple, Optional, Tuple, TYPE_CHECKING, Union
 
 from . import global_config, psycopg
 from .__main__ import Patroni
@@ -746,7 +746,10 @@ def follow(self, demote_reason: str, follow_reason: str, refresh: bool = True) -
             if not node_to_follow:
                 return 'no action. I am ({0})'.format(self.state_handler.name)
         elif is_leader:
-            self.demote('immediate-nolock')
+            if self.is_standby_cluster():
+                self._async_executor.try_run_async('demoting to a standby cluster', self.demote, ('demote-cluster',))
+            else:
+                self.demote('immediate-nolock')
             return demote_reason
 
         if self.is_standby_cluster() and self._leader_timeline and \
@@ -1563,6 +1566,7 @@ def demote(self, mode: str) -> Optional[bool]:
             'graceful':         dict(stop='fast',      checkpoint=True,  release=True,  offline=False, async_req=False),  # noqa: E241,E501
             'immediate':        dict(stop='immediate', checkpoint=False, release=True,  offline=False, async_req=True),   # noqa: E241,E501
             'immediate-nolock': dict(stop='immediate', checkpoint=False, release=False, offline=False, async_req=True),   # noqa: E241,E501
+            'demote-cluster':   dict(stop='fast',      checkpoint=False, release=True,  offline=False, async_req=False),  # noqa: E241,E501
 
         }[mode]
 
@@ -1572,6 +1576,16 @@ def demote(self, mode: str) -> Optional[bool]:
 
         status = {'released': False}
 
+        demote_cluster_with_archive = False
+        archive_cmd = self._rewind.get_archive_command()
+        if mode == 'demote-cluster' and archive_cmd is not None:
+            # We need to send the shutdown checkpoint WAL file to the archive to eliminate the need for a rewind
+            # from a promoted instance that was previously replicating from the archive.
+            # When doing this, we disable the stop timeout, do not run the on_shutdown callback and do not release
+            # the leader key.
+            demote_cluster_with_archive = True
+            mode_control['release'] = False
+
         def on_shutdown(checkpoint_location: int, prev_location: int) -> None:
             # Postmaster is still running, but pg_control already reports clean "shut down".
             # It could happen if Postgres is still archiving the backlog of WAL files.
@@ -1580,8 +1594,11 @@ def on_shutdown(checkpoint_location: int, prev_location: int) -> None:
             time.sleep(1)  # give replicas some more time to catch up
             if self.is_failover_possible(cluster_lsn=checkpoint_location):
                 self.state_handler.set_role(PostgresqlRole.DEMOTED)
+                # for demotion to a standby cluster we need the shutdown checkpoint LSN, not the previous one,
+                # to be written to optime
+                last_lsn = checkpoint_location if mode == 'demote-cluster' else prev_location
                 with self._async_executor:
-                    self.release_leader_key_voluntarily(prev_location)
+                    self.release_leader_key_voluntarily(last_lsn)
                     status['released'] = True
 
         def before_shutdown() -> None:
@@ -1594,16 +1611,33 @@ def before_shutdown() -> None:
                                 on_safepoint=self.watchdog.disable if self.watchdog.is_running else None,
                                 on_shutdown=on_shutdown if mode_control['release'] else None,
                                 before_shutdown=before_shutdown if mode == 'graceful' else None,
-                                stop_timeout=self.primary_stop_timeout())
+                                stop_timeout=None if demote_cluster_with_archive else self.primary_stop_timeout())
         self.state_handler.set_role(PostgresqlRole.DEMOTED)
-        self.set_is_leader(False)
+
+        # for demotion to a standby cluster we need the shutdown checkpoint lsn written to optime, not the prev one
+        checkpoint_lsn, prev_lsn = self.state_handler.latest_checkpoint_locations() \
+            if mode == 'graceful' else (None, None)
+
+        is_standby_leader = mode == 'demote-cluster' and not status['released']
+        if is_standby_leader:
+            with self._async_executor:
+                self.dcs.update_leader(self.cluster, checkpoint_lsn, None, self._failsafe_config())
+            mode_control['release'] = False
+        else:
+            self.set_is_leader(False)
 
         if mode_control['release']:
             if not status['released']:
-                checkpoint_location = self.state_handler.latest_checkpoint_location() if mode == 'graceful' else None
                 with self._async_executor:
-                    self.release_leader_key_voluntarily(checkpoint_location)
+                    self.release_leader_key_voluntarily(prev_lsn)
             time.sleep(2)  # Give a time to somebody to take the leader lock
+
+        if mode == 'demote-cluster':
+            if demote_cluster_with_archive:
+                self._rewind.archive_shutdown_checkpoint_wal(cast(str, archive_cmd))
+            else:
+                logger.info('Not archiving latest checkpoint WAL file. Archiving is not configured.')
+
         if mode_control['offline']:
             node_to_follow, leader = None, None
         else:
@@ -1616,15 +1650,17 @@ def before_shutdown() -> None:
         if self.is_synchronous_mode():
             self.state_handler.sync_handler.set_synchronous_standby_names(CaseInsensitiveSet())
 
+        role = PostgresqlRole.STANDBY_LEADER if is_standby_leader else PostgresqlRole.REPLICA
         # FIXME: with mode offline called from DCS exception handler and handle_long_action_in_progress
         # there could be an async action already running, calling follow from here will lead
         # to racy state handler state updates.
         if mode_control['async_req']:
-            self._async_executor.try_run_async('starting after demotion', self.state_handler.follow, (node_to_follow,))
+            self._async_executor.try_run_async('starting after demotion', self.state_handler.follow,
+                                               (node_to_follow, role,))
         else:
             if self._rewind.rewind_or_reinitialize_needed_and_possible(leader):
                 return False  # do not start postgres, but run pg_rewind on the next iteration
-            self.state_handler.follow(node_to_follow)
+            self.state_handler.follow(node_to_follow, role)
 
     def should_run_scheduled_action(self, action_name: str, scheduled_at: Optional[datetime.datetime],
                                     cleanup_fn: Callable[..., Any]) -> bool:
@@ -2363,8 +2399,8 @@ def _before_shutdown() -> None:
                                                                     stop_timeout=self.primary_stop_timeout()))
         if not self.state_handler.is_running():
             if self.is_leader() and not status['deleted']:
-                checkpoint_location = self.state_handler.latest_checkpoint_location()
-                self.dcs.delete_leader(self.cluster.leader, checkpoint_location)
+                _, prev_location = self.state_handler.latest_checkpoint_locations()
+                self.dcs.delete_leader(self.cluster.leader, prev_location)
             self.touch_member()
         else:
             # XXX: what about when Patroni is started as the wrong user that has access to the watchdog device
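
Illustrative sketch (not part of the commit): the hunks above add a 'demote-cluster' entry to the mode_control table in demote() and make follow() use it when the leader of a cluster is being converted into a standby cluster. The standalone snippet below only mirrors the control-table idea in isolation; MODE_CONTROL and effective_demote_controls() are invented names for illustration and are not Patroni APIs.

from typing import Any, Dict, Optional

# Per-mode control table, mirroring the dictionary added to demote() above: each mode decides how
# Postgres is stopped, whether a checkpoint is taken, whether the leader key is released, and
# whether the restart as a replica happens asynchronously.
MODE_CONTROL: Dict[str, Dict[str, Any]] = {
    'graceful':         dict(stop='fast',      checkpoint=True,  release=True,  async_req=False),
    'immediate':        dict(stop='immediate', checkpoint=False, release=True,  async_req=True),
    'immediate-nolock': dict(stop='immediate', checkpoint=False, release=False, async_req=True),
    'demote-cluster':   dict(stop='fast',      checkpoint=False, release=True,  async_req=False),
}


def effective_demote_controls(mode: str, archive_cmd: Optional[str] = None) -> Dict[str, Any]:
    """Return the control flags a demotion would use, following the logic of the diff above."""
    control = dict(MODE_CONTROL[mode])
    # For demote-cluster with a configured archive_command the leader key is kept (release=False)
    # so the shutdown checkpoint WAL file can be archived before anything else promotes; this
    # corresponds to the demote_cluster_with_archive branch in the hunk starting at line 1576.
    if mode == 'demote-cluster' and archive_cmd is not None:
        control['release'] = False
    return control


if __name__ == '__main__':
    print(effective_demote_controls('demote-cluster'))                           # releases the leader key
    print(effective_demote_controls('demote-cluster', 'cp %p /mnt/archive/%f'))  # keeps the leader key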