@@ -178,7 +178,7 @@ def unit_number(unit_name: str):
178
178
# Lowest unit number is last to refresh
179
179
last_unit_to_refresh = sorted (all_units , key = unit_number )[0 ].replace ("/" , "-" )
180
180
if self ._charm ._patroni .get_primary () == last_unit_to_refresh :
181
- logging .info (
181
+ logger .info (
182
182
f"Unit { last_unit_to_refresh } was already primary during pre-refresh check"
183
183
)
184
184
else :
@@ -188,7 +188,7 @@ def unit_number(unit_name: str):
188
188
logger .warning (f"switchover failed with reason: { e } " )
189
189
raise charm_refresh .PrecheckFailed ("Unable to switch primary" )
190
190
else :
191
- logging .info (
191
+ logger .info (
192
192
f"Switched primary to unit { last_unit_to_refresh } during pre-refresh check"
193
193
)
194
194
@@ -408,6 +408,7 @@ def set_unit_status(
408
408
json .dumps (refresh .unit_status_lower_priority ().message )
409
409
)
410
410
return
411
+ logger .debug (f"Set unit status: { status } " )
411
412
self .unit .status = status
412
413
413
414
def _reconcile_refresh_status (self , _ = None ):
@@ -464,17 +465,10 @@ def patroni_scrape_config(self) -> list[dict]:
464
465
def app_units (self ) -> set [Unit ]:
465
466
"""The peer-related units in the application."""
466
467
if not self ._peers :
467
- return set ()
468
+ return { self . unit }
468
469
469
470
return {self .unit , * self ._peers .units }
470
471
471
- def scoped_peer_data (self , scope : Scopes ) -> dict | None :
472
- """Returns peer data based on scope."""
473
- if scope == APP_SCOPE :
474
- return self .app_peer_data
475
- elif scope == UNIT_SCOPE :
476
- return self .unit_peer_data
477
-
478
472
@property
479
473
def app_peer_data (self ) -> dict :
480
474
"""Application peer relation data object."""
@@ -607,7 +601,6 @@ def postgresql(self) -> PostgreSQL:
607
601
"""Returns an instance of the object used to interact with the database."""
608
602
password = str (self .get_secret (APP_SCOPE , f"{ USER } -password" ))
609
603
if self ._postgresql is None or self ._postgresql .primary_host is None :
610
- logger .debug ("Init class PostgreSQL" )
611
604
self ._postgresql = PostgreSQL (
612
605
primary_host = self .primary_endpoint ,
613
606
current_host = self ._unit_ip ,
@@ -628,21 +621,29 @@ def primary_endpoint(self) -> str | None:
628
621
return None
629
622
try :
630
623
primary = self ._patroni .get_primary ()
624
+ logger .debug (f"primary_endpoint: got primary '{ primary } '" )
631
625
if primary is None and (standby_leader := self ._patroni .get_standby_leader ()):
626
+ logger .debug (f"Using standby_leader { standby_leader } as primary" )
632
627
primary = standby_leader
633
628
primary_endpoint = self ._patroni .get_member_ip (primary )
629
+ logger .debug (f"primary_endpoint: got primary endpoint '{ primary_endpoint } '" )
634
630
# Force a retry if there is no primary or the member that was
635
631
# returned is not in the list of the current cluster members
636
632
# (like when the cluster was not updated yet after a failed switchover).
637
- if not primary_endpoint or primary_endpoint not in self ._units_ips :
638
- # TODO figure out why peer data is not available
639
- if primary_endpoint and len (self ._units_ips ) == 1 and len (self ._peers .units ) > 1 :
640
- logger .warning (
641
- "Possibly incomplete peer data: Will not map primary IP to unit IP"
642
- )
643
- return primary_endpoint
644
- logger .debug ("primary endpoint early exit: Primary IP not in cached peer list." )
633
+ if not primary_endpoint :
634
+ logger .warning (f"Missing primary IP for { primary } " )
645
635
primary_endpoint = None
636
+ elif primary_endpoint not in self ._units_ips :
637
+ if len (self ._peers .units ) == 0 :
638
+ logger .info (f"The unit didn't join { PEER } relation? Using { primary_endpoint } " )
639
+ elif len (self ._units_ips ) == 1 and len (self ._peers .units ) > 1 :
640
+ logger .warning (f"Possibly incomplete peer data, keep using { primary_endpoint } " )
641
+ else :
642
+ logger .debug ("Early exit primary_endpoint: Primary IP not in cached peer list" )
643
+ primary_endpoint = None
644
+ logger .debug ("primary_endpoint: %s" , primary_endpoint )
645
+ logger .debug ("self._units_ips: %s" , self ._units_ips )
646
+ logger .debug ("self._peers.units len: %s" , len (self ._peers .units ))
646
647
except RetryError :
647
648
return None
648
649
else :
@@ -859,6 +860,11 @@ def has_raft_keys(self):
859
860
def _peer_relation_changed_checks (self , event : HookEvent ) -> bool :
860
861
"""Split of to reduce complexity."""
861
862
# Prevents the cluster from being reconfigured before it's bootstrapped in the leader.
863
+ logger .debug (f"Calling on_peer_relation_changed, event: '{ event } '" )
864
+ if event .unit is None : # or event.unit == self.unit.name:
865
+ logger .debug (f"Early exit on_peer_relation_changed: event to itself ({ event .unit } )" )
866
+ return False
867
+
862
868
if not self .is_cluster_initialised :
863
869
logger .debug ("Early exit on_peer_relation_changed: cluster not initialized" )
864
870
return False
@@ -1065,13 +1071,15 @@ def add_cluster_member(self, member: str) -> None:
1065
1071
NotReadyError if either the new member or the current members are not ready.
1066
1072
"""
1067
1073
unit = self .model .get_unit (label2name (member ))
1074
+ logger .debug (f"add_cluster_member: adding unit { unit } to the cluster" )
1068
1075
member_ip = self ._get_unit_ip (unit )
1069
1076
1070
1077
if not self ._patroni .are_all_members_ready ():
1071
1078
logger .info ("not all members are ready" )
1072
1079
raise NotReadyError ("not all members are ready" )
1073
1080
1074
- # Add the member to the list that should be updated in each other member.
1081
+ logger .debug (f"Add member { member_ip } to the members_ips list" )
1082
+ # The members_ips list should be updated in each other member.
1075
1083
self ._add_to_members_ips (member_ip )
1076
1084
1077
1085
# Update Patroni configuration file.
@@ -1176,10 +1184,14 @@ def _units_ips(self) -> set[str]:
1176
1184
A list of peers addresses (strings).
1177
1185
"""
1178
1186
# Get all peer members IPs, add the current unit IP and drop any missing (None) address.
1187
+ for unit in self ._peers .units :
1188
+ logger .debug (f"_units_ips: unit { unit } " )
1179
1189
addresses = {self ._get_unit_ip (unit ) for unit in self ._peers .units }
1180
1190
addresses .add (self ._unit_ip )
1181
1191
if None in addresses :
1192
+ logger .debug ("_units_ips: Removing None" )
1182
1193
addresses .remove (None )
1194
+ logger .debug (f"_units_ips: Final addresses { addresses } " )
1183
1195
return addresses
1184
1196
1185
1197
@property
@@ -1310,7 +1322,7 @@ def _on_install(self, event: InstallEvent) -> None:
1310
1322
self ._reboot_on_detached_storage (event )
1311
1323
return
1312
1324
1313
- self .set_unit_status (MaintenanceStatus ("installing PostgreSQL" ))
1325
+ self .set_unit_status (MaintenanceStatus ("downloading & installing PostgreSQL" ))
1314
1326
1315
1327
# Install the charmed PostgreSQL snap.
1316
1328
try :
@@ -1413,10 +1425,8 @@ def _on_config_changed(self, event) -> None: # noqa: C901
1413
1425
return
1414
1426
1415
1427
if self .refresh is None :
1416
- logger .debug ("Defer on_config_changed: Refresh could be in progress" )
1417
- event .defer ()
1418
- return
1419
- if self .refresh .in_progress :
1428
+ logger .warning ("Warning _on_config_changed: Refresh could be in progress" )
1429
+ elif self .refresh .in_progress :
1420
1430
logger .debug ("Defer on_config_changed: Refresh in progress" )
1421
1431
event .defer ()
1422
1432
return
@@ -1488,14 +1498,17 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
1488
1498
continue
1489
1499
extension = PLUGIN_OVERRIDES .get (extension , extension )
1490
1500
if self ._check_extension_dependencies (extension , enable ):
1501
+ logger .debug (f"Early exit: { extension } has broken dependencies" )
1491
1502
self .set_unit_status (BlockedStatus (EXTENSIONS_DEPENDENCY_MESSAGE ))
1492
1503
return
1493
1504
extensions [extension ] = enable
1494
1505
if self .is_blocked and self .unit .status .message == EXTENSIONS_DEPENDENCY_MESSAGE :
1506
+ logger .debug ("Marking unit as Active" )
1495
1507
self .set_unit_status (ActiveStatus ())
1496
1508
original_status = self .unit .status
1497
1509
self .set_unit_status (WaitingStatus ("Updating extensions" ))
1498
1510
try :
1511
+ logger .debug ("Enabling/disabling PostgreSQL extensions..." )
1499
1512
self .postgresql .enable_disable_extensions (extensions , database )
1500
1513
except psycopg2 .errors .DependentObjectsStillExist as e :
1501
1514
logger .error (
@@ -1507,8 +1520,10 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
1507
1520
except PostgreSQLEnableDisableExtensionError as e :
1508
1521
logger .exception ("failed to change plugins: %s" , str (e ))
1509
1522
if original_status .message == EXTENSION_OBJECT_MESSAGE :
1523
+ logger .debug ("Marking unit as Active and finish with extensions" )
1510
1524
self .set_unit_status (ActiveStatus ())
1511
1525
return
1526
+ logger .debug (f"Restoring original unit status to { original_status } " )
1512
1527
self .set_unit_status (original_status )
1513
1528
1514
1529
def _check_extension_dependencies (self , extension : str , enable : bool ) -> bool :
@@ -1538,10 +1553,8 @@ def _can_start(self, event: StartEvent) -> bool:
1538
1553
1539
1554
# Safeguard against starting while refreshing.
1540
1555
if self .refresh is None :
1541
- logger .debug ("Defer on_start: Refresh could be in progress" )
1542
- event .defer ()
1543
- return False
1544
- if self .refresh .in_progress :
1556
+ logger .warning ("Warning on_start: Refresh could be in progress" )
1557
+ elif self .refresh .in_progress :
1545
1558
# TODO: we should probably start workload if scale up while refresh in progress
1546
1559
logger .debug ("Defer on_start: Refresh in progress" )
1547
1560
event .defer ()
@@ -1591,16 +1604,19 @@ def _on_start(self, event: StartEvent) -> None:
1591
1604
# Only the leader can bootstrap the cluster.
1592
1605
# On replicas, only prepare for starting the instance later.
1593
1606
if not self .unit .is_leader ():
1607
+ logger .debug ("Prepare for starting replica instance later" )
1594
1608
self ._start_replica (event )
1595
1609
self ._restart_services_after_reboot ()
1596
1610
return
1597
1611
1598
- # Bootstrap the cluster in the leader unit.
1612
+ logger . debug ( " Bootstrap the cluster in the leader unit" )
1599
1613
self ._start_primary (event )
1600
1614
self ._restart_services_after_reboot ()
1601
1615
1602
1616
def _restart_services_after_reboot (self ):
1603
1617
"""Restart the Patroni and pgBackRest after a reboot."""
1618
+ logger .debug (f"_restart_services_after_reboot: self._unit_ip: { self ._unit_ip } " )
1619
+ logger .debug (f"_restart_services_after_reboot: self.members_ips: { self .members_ips } " )
1604
1620
if self ._unit_ip in self .members_ips :
1605
1621
self ._patroni .start_patroni ()
1606
1622
self .backup .start_stop_pgbackrest_service ()
@@ -1689,6 +1705,8 @@ def _setup_ldap_sync(self, postgres_snap: snap.Snap | None = None) -> None:
1689
1705
postgres_snap .restart (services = ["ldap-sync" ])
1690
1706
1691
1707
def _setup_users (self ) -> None :
1708
+ """Create PostgreSQL users used/operated by charm."""
1709
+ logger .debug ("Setup PostgreSQL users" )
1692
1710
self .postgresql .create_predefined_instance_roles ()
1693
1711
1694
1712
# Create the default postgres database user that is needed for some
@@ -1697,14 +1715,14 @@ def _setup_users(self) -> None:
1697
1715
# This event can be run on a replica if the machines are restarted.
1698
1716
# For that case, check whether the postgres user already exists.
1699
1717
users = self .postgresql .list_users ()
1700
- # Create the backup user.
1701
1718
if BACKUP_USER not in users :
1719
+ logger .debug (f"Creating user { BACKUP_USER } " )
1702
1720
self .postgresql .create_user (
1703
1721
BACKUP_USER , new_password (), extra_user_roles = [ROLE_BACKUP ]
1704
1722
)
1705
1723
self .postgresql .grant_database_privileges_to_user (BACKUP_USER , "postgres" , ["connect" ])
1706
1724
if MONITORING_USER not in users :
1707
- # Create the monitoring user.
1725
+ logger . debug ( f"Creating user { MONITORING_USER } " )
1708
1726
self .postgresql .create_user (
1709
1727
MONITORING_USER ,
1710
1728
self .get_secret (APP_SCOPE , MONITORING_PASSWORD_KEY ),
@@ -1767,16 +1785,15 @@ def _start_primary(self, event: StartEvent) -> None:
1767
1785
event .defer ()
1768
1786
return
1769
1787
1770
- # Set the flag to enable the replicas to start the Patroni service.
1788
+ logger . debug ( " Set the flag to enable the replicas to start the Patroni service" )
1771
1789
self ._peers .data [self .app ]["cluster_initialised" ] = "True"
1772
1790
# Flag to know if triggers need to be removed after refresh
1773
1791
self ._peers .data [self .app ]["refresh_remove_trigger" ] = "True"
1774
1792
1775
- # Clear unit data if this unit became a replica after a failover/switchover.
1793
+ logger . debug ( " Clear unit data if this unit became a replica after a failover/switchover" )
1776
1794
self ._update_relation_endpoints ()
1777
1795
1778
- # Enable/disable PostgreSQL extensions if they were set before the cluster
1779
- # was fully initialised.
1796
+ # Enable/disable PostgreSQL extensions if they were set before the cluster was fully initialised.
1780
1797
self .enable_disable_extensions ()
1781
1798
1782
1799
logger .debug ("Active workload time: %s" , datetime .now ())
@@ -2001,6 +2018,7 @@ def _was_restore_successful(self) -> bool:
2001
2018
2002
2019
def _can_run_on_update_status (self ) -> bool :
2003
2020
if not self .is_cluster_initialised :
2021
+ logger .debug ("Early exit on_update_status: cluster is not initialised" )
2004
2022
return False
2005
2023
2006
2024
if self .has_raft_keys ():
@@ -2045,8 +2063,8 @@ def _handle_processes_failures(self) -> bool:
2045
2063
logger .info ("PostgreSQL data directory was not empty. Moved pg_wal" )
2046
2064
return True
2047
2065
try :
2048
- self ._patroni .restart_patroni ()
2049
2066
logger .info ("restarted PostgreSQL because it was not running" )
2067
+ self ._patroni .restart_patroni ()
2050
2068
return True
2051
2069
except RetryError :
2052
2070
logger .error ("failed to restart PostgreSQL after checking that it was not running" )
@@ -2061,6 +2079,7 @@ def _set_primary_status_message(self) -> None:
2061
2079
self .set_unit_status (
2062
2080
BlockedStatus (self .app_peer_data ["s3-initialization-block-message" ])
2063
2081
)
2082
+ logger .debug ("Early exit _set_primary_status_message: s3 is blocked" )
2064
2083
return
2065
2084
if self .unit .is_leader () and (
2066
2085
self .app_peer_data .get ("logical-replication-validation" ) == "error"
@@ -2077,12 +2096,12 @@ def _set_primary_status_message(self) -> None:
2077
2096
danger_state = " (read-only)"
2078
2097
elif len (self ._patroni .get_running_cluster_members ()) < self .app .planned_units ():
2079
2098
danger_state = " (degraded)"
2080
- self .set_unit_status (
2081
- ActiveStatus (
2082
- f"{ 'Standby' if self .is_standby_leader else 'Primary' } { danger_state } "
2083
- )
2084
- )
2099
+ unit_status = "Standby" if self .is_standby_leader else "Primary"
2100
+ unit_status = unit_status + danger_state
2101
+ logger .debug (f"Set ActiveStatus({ unit_status } )" )
2102
+ self .set_unit_status (ActiveStatus (f"{ unit_status } " ))
2085
2103
elif self ._patroni .member_started :
2104
+ logger .debug ("Set ActiveStatus()" )
2086
2105
self .set_unit_status (ActiveStatus ())
2087
2106
except (RetryError , ConnectionError ) as e :
2088
2107
logger .error (f"failed to get primary with error { e } " )
@@ -2286,13 +2305,16 @@ def _is_workload_running(self) -> bool:
2286
2305
@property
2287
2306
def _can_connect_to_postgresql (self ) -> bool :
2288
2307
try :
2289
- for attempt in Retrying (stop = stop_after_delay (30 ), wait = wait_fixed (3 )):
2308
+ for attempt in Retrying (stop = stop_after_delay (10 ), wait = wait_fixed (3 )):
2290
2309
with attempt :
2310
+ logger .debug ("Checking connection to PostgreSQL database..." )
2291
2311
if not self .postgresql .get_postgresql_timezones ():
2312
+ logger .debug ("Cannot connect to database (CannotConnectError)" )
2292
2313
raise CannotConnectError
2293
2314
except RetryError :
2294
- logger .debug ("Cannot connect to database" )
2315
+ logger .debug ("Cannot connect to database (RetryError) " )
2295
2316
return False
2317
+ logger .debug ("Successfully connected to the database" )
2296
2318
return True
2297
2319
2298
2320
def update_config (
@@ -2303,6 +2325,7 @@ def update_config(
2303
2325
refresh : charm_refresh .Machines | None = None ,
2304
2326
) -> bool :
2305
2327
"""Updates Patroni config file based on the existence of the TLS files."""
2328
+ logger .debug ("Updating Patroni config" )
2306
2329
if refresh is None :
2307
2330
refresh = self .refresh
2308
2331
@@ -2396,6 +2419,7 @@ def update_config(
2396
2419
2397
2420
self .unit_peer_data .update ({"user_hash" : self .generate_user_hash })
2398
2421
if self .unit .is_leader ():
2422
+ logger .debug (f"Updating user_hash in app databag on leader: { self .generate_user_hash } " )
2399
2423
self .app_peer_data .update ({"user_hash" : self .generate_user_hash })
2400
2424
return True
2401
2425
@@ -2415,6 +2439,7 @@ def _validate_config_options(self) -> None:
2415
2439
if not self .postgresql .validate_date_style (self .config .request_date_style ):
2416
2440
raise ValueError ("request_date_style config option has an invalid value" )
2417
2441
2442
+ logger .debug ("Checking timezone config options" )
2418
2443
if self .config .request_time_zone not in self .postgresql .get_postgresql_timezones ():
2419
2444
raise ValueError ("request_time_zone config option has an invalid value" )
2420
2445
@@ -2428,6 +2453,7 @@ def _validate_config_options(self) -> None:
2428
2453
2429
2454
def _handle_postgresql_restart_need (self ) -> None :
2430
2455
"""Handle PostgreSQL restart need based on the TLS configuration and configuration changes."""
2456
+ logger .debug ("Checking for PostgreSQL restart necessity" )
2431
2457
if self ._can_connect_to_postgresql :
2432
2458
restart_postgresql = self .is_tls_enabled != self .postgresql .is_tls_enabled ()
2433
2459
else :
0 commit comments