@@ -178,7 +178,7 @@ def unit_number(unit_name: str):
178
178
# Lowest unit number is last to refresh
179
179
last_unit_to_refresh = sorted (all_units , key = unit_number )[0 ].replace ("/" , "-" )
180
180
if self ._charm ._patroni .get_primary () == last_unit_to_refresh :
181
- logging .info (
181
+ logger .info (
182
182
f"Unit { last_unit_to_refresh } was already primary during pre-refresh check"
183
183
)
184
184
else :
@@ -188,7 +188,7 @@ def unit_number(unit_name: str):
188
188
logger .warning (f"switchover failed with reason: { e } " )
189
189
raise charm_refresh .PrecheckFailed ("Unable to switch primary" )
190
190
else :
191
- logging .info (
191
+ logger .info (
192
192
f"Switched primary to unit { last_unit_to_refresh } during pre-refresh check"
193
193
)
194
194
@@ -408,6 +408,7 @@ def set_unit_status(
408
408
json .dumps (refresh .unit_status_lower_priority ().message )
409
409
)
410
410
return
411
+ logger .debug (f"Set unit status: { status } " )
411
412
self .unit .status = status
412
413
413
414
def _reconcile_refresh_status (self , _ = None ):
@@ -464,17 +465,10 @@ def patroni_scrape_config(self) -> list[dict]:
464
465
def app_units (self ) -> set [Unit ]:
465
466
"""The peer-related units in the application."""
466
467
if not self ._peers :
467
- return set ()
468
+ return { self . unit }
468
469
469
470
return {self .unit , * self ._peers .units }
470
471
471
- def scoped_peer_data (self , scope : Scopes ) -> dict | None :
472
- """Returns peer data based on scope."""
473
- if scope == APP_SCOPE :
474
- return self .app_peer_data
475
- elif scope == UNIT_SCOPE :
476
- return self .unit_peer_data
477
-
478
472
@property
479
473
def app_peer_data (self ) -> dict :
480
474
"""Application peer relation data object."""
@@ -607,7 +601,6 @@ def postgresql(self) -> PostgreSQL:
607
601
"""Returns an instance of the object used to interact with the database."""
608
602
password = str (self .get_secret (APP_SCOPE , f"{ USER } -password" ))
609
603
if self ._postgresql is None or self ._postgresql .primary_host is None :
610
- logger .debug ("Init class PostgreSQL" )
611
604
self ._postgresql = PostgreSQL (
612
605
primary_host = self .primary_endpoint ,
613
606
current_host = self ._unit_ip ,
@@ -628,21 +621,29 @@ def primary_endpoint(self) -> str | None:
628
621
return None
629
622
try :
630
623
primary = self ._patroni .get_primary ()
624
+ logger .debug (f"primary_endpoint: got primary '{ primary } '" )
631
625
if primary is None and (standby_leader := self ._patroni .get_standby_leader ()):
626
+ logger .debug (f"Using standby_leader { standby_leader } as primary" )
632
627
primary = standby_leader
633
628
primary_endpoint = self ._patroni .get_member_ip (primary )
629
+ logger .debug (f"primary_endpoint: got primary endpoint '{ primary_endpoint } '" )
634
630
# Force a retry if there is no primary or the member that was
635
631
# returned is not in the list of the current cluster members
636
632
# (like when the cluster was not updated yet after a failed switchover).
637
- if not primary_endpoint or primary_endpoint not in self ._units_ips :
638
- # TODO figure out why peer data is not available
639
- if primary_endpoint and len (self ._units_ips ) == 1 and len (self ._peers .units ) > 1 :
640
- logger .warning (
641
- "Possibly incomplete peer data: Will not map primary IP to unit IP"
642
- )
643
- return primary_endpoint
644
- logger .debug ("primary endpoint early exit: Primary IP not in cached peer list." )
633
+ if not primary_endpoint :
634
+ logger .warning (f"Missing primary IP for { primary } " )
645
635
primary_endpoint = None
636
+ elif primary_endpoint not in self ._units_ips :
637
+ if len (self ._peers .units ) == 0 :
638
+ logger .info (f"The unit didn't join { PEER } relation? Using { primary_endpoint } " )
639
+ elif len (self ._units_ips ) == 1 and len (self ._peers .units ) > 1 :
640
+ logger .warning (f"Possibly incomplete peer data, keep using { primary_endpoint } " )
641
+ else :
642
+ logger .debug ("Early exit primary_endpoint: Primary IP not in cached peer list" )
643
+ primary_endpoint = None
644
+ logger .debug ("primary_endpoint: %s" , primary_endpoint )
645
+ logger .debug ("self._units_ips: %s" , self ._units_ips )
646
+ logger .debug ("self._peers.units len: %s" , len (self ._peers .units ))
646
647
except RetryError :
647
648
return None
648
649
else :
@@ -859,6 +860,11 @@ def has_raft_keys(self):
859
860
def _peer_relation_changed_checks (self , event : HookEvent ) -> bool :
860
861
"""Split off to reduce complexity."""
861
862
# Prevents the cluster from being reconfigured before it's bootstrapped in the leader.
863
+ logger .debug (f"Calling on_peer_relation_changed, event: '{ event } '" )
864
+ if hasattr (event , "unit" ) and event .unit is None :
865
+ logger .debug (f"Early exit on_peer_relation_changed: event to itself ({ event .unit } )" )
866
+ return False
867
+
862
868
if not self .is_cluster_initialised :
863
869
logger .debug ("Early exit on_peer_relation_changed: cluster not initialized" )
864
870
return False
@@ -1065,13 +1071,15 @@ def add_cluster_member(self, member: str) -> None:
1065
1071
NotReadyError if either the new member or the current members are not ready.
1066
1072
"""
1067
1073
unit = self .model .get_unit (label2name (member ))
1074
+ logger .debug (f"add_cluster_member: adding unit { unit } to the cluster" )
1068
1075
member_ip = self ._get_unit_ip (unit )
1069
1076
1070
1077
if not self ._patroni .are_all_members_ready ():
1071
1078
logger .info ("not all members are ready" )
1072
1079
raise NotReadyError ("not all members are ready" )
1073
1080
1074
- # Add the member to the list that should be updated in each other member.
1081
+ logger .debug (f"Add member { member_ip } to the members_ips list" )
1082
+ # It should be updated in each other member.
1075
1083
self ._add_to_members_ips (member_ip )
1076
1084
1077
1085
# Update Patroni configuration file.
@@ -1180,6 +1188,7 @@ def _units_ips(self) -> set[str]:
1180
1188
addresses .add (self ._unit_ip )
1181
1189
if None in addresses :
1182
1190
addresses .remove (None )
1191
+ logger .debug (f"_units_ips addresses: { addresses } " )
1183
1192
return addresses
1184
1193
1185
1194
@property
@@ -1310,7 +1319,7 @@ def _on_install(self, event: InstallEvent) -> None:
1310
1319
self ._reboot_on_detached_storage (event )
1311
1320
return
1312
1321
1313
- self .set_unit_status (MaintenanceStatus ("installing PostgreSQL" ))
1322
+ self .set_unit_status (MaintenanceStatus ("downloading & installing PostgreSQL" ))
1314
1323
1315
1324
# Install the charmed PostgreSQL snap.
1316
1325
try :
@@ -1413,10 +1422,8 @@ def _on_config_changed(self, event) -> None: # noqa: C901
1413
1422
return
1414
1423
1415
1424
if self .refresh is None :
1416
- logger .debug ("Defer on_config_changed: Refresh could be in progress" )
1417
- event .defer ()
1418
- return
1419
- if self .refresh .in_progress :
1425
+ logger .warning ("Warning _on_config_changed: Refresh could be in progress" )
1426
+ elif self .refresh .in_progress :
1420
1427
logger .debug ("Defer on_config_changed: Refresh in progress" )
1421
1428
event .defer ()
1422
1429
return
@@ -1488,14 +1495,17 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
1488
1495
continue
1489
1496
extension = PLUGIN_OVERRIDES .get (extension , extension )
1490
1497
if self ._check_extension_dependencies (extension , enable ):
1498
+ logger .debug (f"Early exit: { extension } has broken dependencies" )
1491
1499
self .set_unit_status (BlockedStatus (EXTENSIONS_DEPENDENCY_MESSAGE ))
1492
1500
return
1493
1501
extensions [extension ] = enable
1494
1502
if self .is_blocked and self .unit .status .message == EXTENSIONS_DEPENDENCY_MESSAGE :
1503
+ logger .debug ("Marking unit as Active" )
1495
1504
self .set_unit_status (ActiveStatus ())
1496
1505
original_status = self .unit .status
1497
1506
self .set_unit_status (WaitingStatus ("Updating extensions" ))
1498
1507
try :
1508
+ logger .debug ("Enabling/disabling PostgreSQL extensions..." )
1499
1509
self .postgresql .enable_disable_extensions (extensions , database )
1500
1510
except psycopg2 .errors .DependentObjectsStillExist as e :
1501
1511
logger .error (
@@ -1507,8 +1517,10 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
1507
1517
except PostgreSQLEnableDisableExtensionError as e :
1508
1518
logger .exception ("failed to change plugins: %s" , str (e ))
1509
1519
if original_status .message == EXTENSION_OBJECT_MESSAGE :
1520
+ logger .debug ("Marking unit as Active and finish with extensions" )
1510
1521
self .set_unit_status (ActiveStatus ())
1511
1522
return
1523
+ logger .debug (f"Restoring original unit status to { original_status } " )
1512
1524
self .set_unit_status (original_status )
1513
1525
1514
1526
def _check_extension_dependencies (self , extension : str , enable : bool ) -> bool :
@@ -1538,10 +1550,8 @@ def _can_start(self, event: StartEvent) -> bool:
1538
1550
1539
1551
# Safeguard against starting while refreshing.
1540
1552
if self .refresh is None :
1541
- logger .debug ("Defer on_start: Refresh could be in progress" )
1542
- event .defer ()
1543
- return False
1544
- if self .refresh .in_progress :
1553
+ logger .warning ("Warning on_start: Refresh could be in progress" )
1554
+ elif self .refresh .in_progress :
1545
1555
# TODO: we should probably start workload if scale up while refresh in progress
1546
1556
logger .debug ("Defer on_start: Refresh in progress" )
1547
1557
event .defer ()
@@ -1591,16 +1601,19 @@ def _on_start(self, event: StartEvent) -> None:
1591
1601
# Only the leader can bootstrap the cluster.
1592
1602
# On replicas, only prepare for starting the instance later.
1593
1603
if not self .unit .is_leader ():
1604
+ logger .debug ("Prepare for starting replica instance later" )
1594
1605
self ._start_replica (event )
1595
1606
self ._restart_services_after_reboot ()
1596
1607
return
1597
1608
1598
- # Bootstrap the cluster in the leader unit.
1609
+ logger . debug ( " Bootstrap the cluster in the leader unit" )
1599
1610
self ._start_primary (event )
1600
1611
self ._restart_services_after_reboot ()
1601
1612
1602
1613
def _restart_services_after_reboot (self ):
1603
1614
"""Restart the Patroni and pgBackRest after a reboot."""
1615
+ logger .debug (f"_restart_services_after_reboot: self._unit_ip: { self ._unit_ip } " )
1616
+ logger .debug (f"_restart_services_after_reboot: self.members_ips: { self .members_ips } " )
1604
1617
if self ._unit_ip in self .members_ips :
1605
1618
self ._patroni .start_patroni ()
1606
1619
self .backup .start_stop_pgbackrest_service ()
@@ -1689,6 +1702,8 @@ def _setup_ldap_sync(self, postgres_snap: snap.Snap | None = None) -> None:
1689
1702
postgres_snap .restart (services = ["ldap-sync" ])
1690
1703
1691
1704
def _setup_users (self ) -> None :
1705
+ """Create PostgreSQL users used/operated by charm."""
1706
+ logger .debug ("Setup PostgreSQL users" )
1692
1707
self .postgresql .create_predefined_instance_roles ()
1693
1708
1694
1709
# Create the default postgres database user that is needed for some
@@ -1697,14 +1712,14 @@ def _setup_users(self) -> None:
1697
1712
# This event can be run on a replica if the machines are restarted.
1698
1713
# For that case, check whether the postgres user already exists.
1699
1714
users = self .postgresql .list_users ()
1700
- # Create the backup user.
1701
1715
if BACKUP_USER not in users :
1716
+ logger .debug (f"Creating user { BACKUP_USER } " )
1702
1717
self .postgresql .create_user (
1703
1718
BACKUP_USER , new_password (), extra_user_roles = [ROLE_BACKUP ]
1704
1719
)
1705
1720
self .postgresql .grant_database_privileges_to_user (BACKUP_USER , "postgres" , ["connect" ])
1706
1721
if MONITORING_USER not in users :
1707
- # Create the monitoring user.
1722
+ logger . debug ( f"Creating user { MONITORING_USER } " )
1708
1723
self .postgresql .create_user (
1709
1724
MONITORING_USER ,
1710
1725
self .get_secret (APP_SCOPE , MONITORING_PASSWORD_KEY ),
@@ -1767,16 +1782,15 @@ def _start_primary(self, event: StartEvent) -> None:
1767
1782
event .defer ()
1768
1783
return
1769
1784
1770
- # Set the flag to enable the replicas to start the Patroni service.
1785
+ logger . debug ( " Set the flag to enable the replicas to start the Patroni service" )
1771
1786
self ._peers .data [self .app ]["cluster_initialised" ] = "True"
1772
1787
# Flag to know if triggers need to be removed after refresh
1773
1788
self ._peers .data [self .app ]["refresh_remove_trigger" ] = "True"
1774
1789
1775
- # Clear unit data if this unit became a replica after a failover/switchover.
1790
+ logger . debug ( " Clear unit data if this unit became a replica after a failover/switchover" )
1776
1791
self ._update_relation_endpoints ()
1777
1792
1778
- # Enable/disable PostgreSQL extensions if they were set before the cluster
1779
- # was fully initialised.
1793
+ # if extensions were set before the cluster was fully initialised.
1780
1794
self .enable_disable_extensions ()
1781
1795
1782
1796
logger .debug ("Active workload time: %s" , datetime .now ())
@@ -2001,6 +2015,7 @@ def _was_restore_successful(self) -> bool:
2001
2015
2002
2016
def _can_run_on_update_status (self ) -> bool :
2003
2017
if not self .is_cluster_initialised :
2018
+ logger .debug ("Early exit on_update_status: cluster is not initialised" )
2004
2019
return False
2005
2020
2006
2021
if self .has_raft_keys ():
@@ -2045,8 +2060,8 @@ def _handle_processes_failures(self) -> bool:
2045
2060
logger .info ("PostgreSQL data directory was not empty. Moved pg_wal" )
2046
2061
return True
2047
2062
try :
2048
- self ._patroni .restart_patroni ()
2049
2063
logger .info ("restarted PostgreSQL because it was not running" )
2064
+ self ._patroni .restart_patroni ()
2050
2065
return True
2051
2066
except RetryError :
2052
2067
logger .error ("failed to restart PostgreSQL after checking that it was not running" )
@@ -2061,6 +2076,7 @@ def _set_primary_status_message(self) -> None:
2061
2076
self .set_unit_status (
2062
2077
BlockedStatus (self .app_peer_data ["s3-initialization-block-message" ])
2063
2078
)
2079
+ logger .debug ("Early exit _set_primary_status_message: s3 is blocked" )
2064
2080
return
2065
2081
if self .unit .is_leader () and (
2066
2082
self .app_peer_data .get ("logical-replication-validation" ) == "error"
@@ -2077,12 +2093,12 @@ def _set_primary_status_message(self) -> None:
2077
2093
danger_state = " (read-only)"
2078
2094
elif len (self ._patroni .get_running_cluster_members ()) < self .app .planned_units ():
2079
2095
danger_state = " (degraded)"
2080
- self .set_unit_status (
2081
- ActiveStatus (
2082
- f"{ 'Standby' if self .is_standby_leader else 'Primary' } { danger_state } "
2083
- )
2084
- )
2096
+ unit_status = "Standby" if self .is_standby_leader else "Primary"
2097
+ unit_status = unit_status + danger_state
2098
+ logger .debug (f"Set ActiveStatus({ unit_status } )" )
2099
+ self .set_unit_status (ActiveStatus (f"{ unit_status } " ))
2085
2100
elif self ._patroni .member_started :
2101
+ logger .debug ("Set ActiveStatus()" )
2086
2102
self .set_unit_status (ActiveStatus ())
2087
2103
except (RetryError , ConnectionError ) as e :
2088
2104
logger .error (f"failed to get primary with error { e } " )
@@ -2286,13 +2302,16 @@ def _is_workload_running(self) -> bool:
2286
2302
@property
2287
2303
def _can_connect_to_postgresql (self ) -> bool :
2288
2304
try :
2289
- for attempt in Retrying (stop = stop_after_delay (30 ), wait = wait_fixed (3 )):
2305
+ for attempt in Retrying (stop = stop_after_delay (10 ), wait = wait_fixed (3 )):
2290
2306
with attempt :
2307
+ logger .debug ("Checking connection to PostgreSQL database..." )
2291
2308
if not self .postgresql .get_postgresql_timezones ():
2309
+ logger .debug ("Cannot connect to database (CannotConnectError)" )
2292
2310
raise CannotConnectError
2293
2311
except RetryError :
2294
- logger .debug ("Cannot connect to database" )
2312
+ logger .debug ("Cannot connect to database (RetryError) " )
2295
2313
return False
2314
+ logger .debug ("Successfully connected to the database" )
2296
2315
return True
2297
2316
2298
2317
def update_config (
@@ -2303,6 +2322,7 @@ def update_config(
2303
2322
refresh : charm_refresh .Machines | None = None ,
2304
2323
) -> bool :
2305
2324
"""Updates Patroni config file based on the existence of the TLS files."""
2325
+ logger .debug ("Updating Patroni config" )
2306
2326
if refresh is None :
2307
2327
refresh = self .refresh
2308
2328
@@ -2396,6 +2416,7 @@ def update_config(
2396
2416
2397
2417
self .unit_peer_data .update ({"user_hash" : self .generate_user_hash })
2398
2418
if self .unit .is_leader ():
2419
+ logger .debug (f"Updating user_hash in app databag on leader: { self .generate_user_hash } " )
2399
2420
self .app_peer_data .update ({"user_hash" : self .generate_user_hash })
2400
2421
return True
2401
2422
@@ -2415,6 +2436,7 @@ def _validate_config_options(self) -> None:
2415
2436
if not self .postgresql .validate_date_style (self .config .request_date_style ):
2416
2437
raise ValueError ("request_date_style config option has an invalid value" )
2417
2438
2439
+ logger .debug ("Checking timezone config options" )
2418
2440
if self .config .request_time_zone not in self .postgresql .get_postgresql_timezones ():
2419
2441
raise ValueError ("request_time_zone config option has an invalid value" )
2420
2442
@@ -2428,6 +2450,7 @@ def _validate_config_options(self) -> None:
2428
2450
2429
2451
def _handle_postgresql_restart_need (self ) -> None :
2430
2452
"""Handle PostgreSQL restart need based on the TLS configuration and configuration changes."""
2453
+ logger .debug ("Checking for PostgreSQL restart necessity" )
2431
2454
if self ._can_connect_to_postgresql :
2432
2455
restart_postgresql = self .is_tls_enabled != self .postgresql .is_tls_enabled ()
2433
2456
else :
0 commit comments