45
45
Container ,
46
46
JujuVersion ,
47
47
MaintenanceStatus ,
48
+ ModelError ,
48
49
Relation ,
49
50
Unit ,
50
51
UnknownStatus ,
51
52
WaitingStatus ,
52
53
)
53
54
from ops .pebble import ChangeError , Layer , PathError , ProtocolError , ServiceStatus
54
55
from requests import ConnectionError
55
- from tenacity import RetryError , Retrying , stop_after_attempt , wait_fixed
56
+ from tenacity import RetryError , Retrying , stop_after_attempt , stop_after_delay , wait_fixed
56
57
57
58
from backups import PostgreSQLBackups
58
59
from config import CharmConfig
80
81
WORKLOAD_OS_GROUP ,
81
82
WORKLOAD_OS_USER ,
82
83
)
83
- from patroni import NotReadyError , Patroni
84
+ from patroni import NotReadyError , Patroni , SwitchoverFailedError
84
85
from relations .async_replication import PostgreSQLAsyncReplication
85
86
from relations .db import EXTENSIONS_BLOCKING_MESSAGE , DbProvides
86
87
from relations .postgresql_provider import PostgreSQLProvider
@@ -144,6 +145,7 @@ def __init__(self, *args):
144
145
self .framework .observe (self .on .secret_changed , self ._on_peer_relation_changed )
145
146
self .framework .observe (self .on [PEER ].relation_departed , self ._on_peer_relation_departed )
146
147
self .framework .observe (self .on .postgresql_pebble_ready , self ._on_postgresql_pebble_ready )
148
+ self .framework .observe (self .on .pgdata_storage_detaching , self ._on_pgdata_storage_detaching )
147
149
self .framework .observe (self .on .stop , self ._on_stop )
148
150
self .framework .observe (self .on .upgrade_charm , self ._on_upgrade_charm )
149
151
self .framework .observe (self .on .get_password_action , self ._on_get_password )
@@ -379,14 +381,68 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
379
381
# Update the sync-standby endpoint in the async replication data.
380
382
self .async_replication .update_async_replication_data ()
381
383
382
- def _on_peer_relation_changed (self , event : HookEvent ) -> None :
384
def _on_pgdata_storage_detaching(self, _) -> None:
    """Hand the primary role over before this unit's pgdata storage detaches.

    When this unit is the current primary and all members are ready, a
    switchover is requested and the client relation endpoints are refreshed
    afterwards. In every other case the handler returns early and relies on
    an automatic failover instead.

    Args:
        _: the storage-detaching event (unused).
    """
    # Change the primary if it's the unit that is being removed.
    try:
        primary = self._patroni.get_primary(unit_name_pattern=True)
    except RetryError:
        # Ignore the event if the primary couldn't be retrieved.
        # If a switchover is needed, an automatic failover will be triggered
        # when the unit is removed.
        logger.debug("Early exit on_pgdata_storage_detaching: primary cannot be retrieved")
        return

    # Nothing to hand over when another unit holds the primary role.
    if self.unit.name != primary:
        return

    if not self._patroni.are_all_members_ready():
        logger.warning(
            "could not switchover because not all members are ready"
            " - an automatic failover will be triggered"
        )
        return

    # Try to switch over to another member. A failure is not propagated:
    # it is logged and the handler returns, leaving the role change to an
    # automatic failover.
    try:
        # Get the current primary to check if it has changed later.
        current_primary = self._patroni.get_primary()

        # Trigger the switchover.
        self._patroni.switchover()

        # Wait for the switchover to complete.
        self._patroni.primary_changed(current_primary)

        logger.info("successful switchover")
    except (RetryError, SwitchoverFailedError) as e:
        logger.warning(
            "switchover failed with reason: %s - an automatic failover will be triggered", e
        )
        return

    # Reached only after a successful switchover, so the connection
    # endpoints can be refreshed.
    endpoints_to_remove = self._get_endpoints_to_remove()
    try:
        self.postgresql_client_relation.update_read_only_endpoint()
    except ModelError as e:
        # Same best-effort handling as in _on_peer_relation_changed: a failed
        # relation-data update must not make the detaching hook error out.
        logger.warning("Cannot update read_only endpoints: %s", str(e))
    self._remove_from_endpoints(endpoints_to_remove)
430
+
431
+ def _on_peer_relation_changed (self , event : HookEvent ) -> None : # noqa: C901
383
432
"""Reconfigure cluster members."""
384
433
# The cluster must be initialized first in the leader unit
385
434
# before any other member joins the cluster.
386
435
if "cluster_initialised" not in self ._peers .data [self .app ]:
387
- logger .debug (
388
- "Deferring on_peer_relation_changed: Cluster must be initialized before members can join"
389
- )
436
+ if self .unit .is_leader ():
437
+ if self ._initialize_cluster (event ):
438
+ logger .debug ("Deferring on_peer_relation_changed: Leader initialized cluster" )
439
+ else :
440
+ logger .debug ("_initialized_cluster failed on _peer_relation_changed" )
441
+ return
442
+ else :
443
+ logger .debug (
444
+ "Deferring on_peer_relation_changed: Cluster must be initialized before members can join"
445
+ )
390
446
event .defer ()
391
447
return
392
448
@@ -437,7 +493,10 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None:
437
493
event .defer ()
438
494
return
439
495
440
- self .postgresql_client_relation .update_read_only_endpoint ()
496
+ try :
497
+ self .postgresql_client_relation .update_read_only_endpoint ()
498
+ except ModelError as e :
499
+ logger .warning ("Cannot update read_only endpoints: %s" , str (e ))
441
500
442
501
self .backup .coordinate_stanza_fields ()
443
502
@@ -594,6 +653,9 @@ def _add_members(self, event) -> None:
594
653
except NotReadyError :
595
654
logger .info ("Deferring reconfigure: another member doing sync right now" )
596
655
event .defer ()
656
+ except RetryError :
657
+ logger .info ("Deferring reconfigure: failed to obtain cluster members from Patroni" )
658
+ event .defer ()
597
659
598
660
def add_cluster_member (self , member : str ) -> None :
599
661
"""Add member to the cluster if all members are already up and running.
@@ -1432,6 +1494,14 @@ def _restart(self, event: RunWithLock) -> None:
1432
1494
# Update health check URL.
1433
1495
self ._update_pebble_layers ()
1434
1496
1497
+ try :
1498
+ for attempt in Retrying (wait = wait_fixed (3 ), stop = stop_after_delay (300 )):
1499
+ with attempt :
1500
+ if not self ._can_connect_to_postgresql :
1501
+ assert False
1502
+ except Exception :
1503
+ logger .exception ("Unable to reconnect to postgresql" )
1504
+
1435
1505
# Start or stop the pgBackRest TLS server service when TLS certificate change.
1436
1506
self .backup .start_stop_pgbackrest_service ()
1437
1507
@@ -1448,6 +1518,17 @@ def _is_workload_running(self) -> bool:
1448
1518
1449
1519
return services [0 ].current == ServiceStatus .ACTIVE
1450
1520
1521
@property
def _can_connect_to_postgresql(self) -> bool:
    """Report whether the database answers a timezone query.

    Calls ``self.postgresql.get_postgresql_timezones()`` with a fixed
    3-second wait between attempts, for up to 30 seconds. A falsy result
    is treated as a failed attempt and retried.

    Returns:
        True once an attempt yields a truthy result, False if the retry
        window is exhausted (tenacity raises ``RetryError``).
    """
    try:
        for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
            with attempt:
                # Raise explicitly instead of using `assert`: asserts are
                # stripped under `python -O`, which would skip the query
                # entirely and make this property always return True.
                if not self.postgresql.get_postgresql_timezones():
                    raise RuntimeError("PostgreSQL returned no timezones")
    except RetryError:
        logger.debug("Cannot connect to database")
        return False
    return True
1531
+
1451
1532
def update_config (self , is_creating_backup : bool = False ) -> bool :
1452
1533
"""Updates Patroni config file based on the existence of the TLS files."""
1453
1534
# Retrieve PostgreSQL parameters.
0 commit comments