Skip to content

Commit b6ed998

Browse files
Increase debug verbosity (especially for REST API requests)
* Also use common logger everywhere. (cherry picked from commit 0d332d9c22c6acb96a165576b9488c7fa31734f6)
1 parent 98f6ba4 commit b6ed998

File tree

4 files changed

+128
-60
lines changed

4 files changed

+128
-60
lines changed

lib/charms/postgresql_k8s/v1/postgresql.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,10 @@ def _connect_to_database(
255255
psycopg2 connection object.
256256
"""
257257
host = database_host if database_host is not None else self.primary_host
258+
dbname = database if database else self.database
259+
logger.debug(f"New DB connection: dbname='{dbname}' user='{self.user}' host='{host}' connect_timeout=1")
258260
connection = psycopg2.connect(
259-
f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'"
261+
f"dbname='{dbname}' user='{self.user}' host='{host}'"
260262
f"password='{self.password}' connect_timeout=1"
261263
)
262264
connection.autocommit = True

src/charm.py

Lines changed: 70 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def unit_number(unit_name: str):
178178
# Lowest unit number is last to refresh
179179
last_unit_to_refresh = sorted(all_units, key=unit_number)[0].replace("/", "-")
180180
if self._charm._patroni.get_primary() == last_unit_to_refresh:
181-
logging.info(
181+
logger.info(
182182
f"Unit {last_unit_to_refresh} was already primary during pre-refresh check"
183183
)
184184
else:
@@ -188,7 +188,7 @@ def unit_number(unit_name: str):
188188
logger.warning(f"switchover failed with reason: {e}")
189189
raise charm_refresh.PrecheckFailed("Unable to switch primary")
190190
else:
191-
logging.info(
191+
logger.info(
192192
f"Switched primary to unit {last_unit_to_refresh} during pre-refresh check"
193193
)
194194

@@ -408,6 +408,7 @@ def set_unit_status(
408408
json.dumps(refresh.unit_status_lower_priority().message)
409409
)
410410
return
411+
logger.debug(f"Set unit status: {status}")
411412
self.unit.status = status
412413

413414
def _reconcile_refresh_status(self, _=None):
@@ -464,17 +465,10 @@ def patroni_scrape_config(self) -> list[dict]:
464465
def app_units(self) -> set[Unit]:
465466
"""The peer-related units in the application."""
466467
if not self._peers:
467-
return set()
468+
return {self.unit}
468469

469470
return {self.unit, *self._peers.units}
470471

471-
def scoped_peer_data(self, scope: Scopes) -> dict | None:
472-
"""Returns peer data based on scope."""
473-
if scope == APP_SCOPE:
474-
return self.app_peer_data
475-
elif scope == UNIT_SCOPE:
476-
return self.unit_peer_data
477-
478472
@property
479473
def app_peer_data(self) -> dict:
480474
"""Application peer relation data object."""
@@ -607,7 +601,6 @@ def postgresql(self) -> PostgreSQL:
607601
"""Returns an instance of the object used to interact with the database."""
608602
password = str(self.get_secret(APP_SCOPE, f"{USER}-password"))
609603
if self._postgresql is None or self._postgresql.primary_host is None:
610-
logger.debug("Init class PostgreSQL")
611604
self._postgresql = PostgreSQL(
612605
primary_host=self.primary_endpoint,
613606
current_host=self._unit_ip,
@@ -628,21 +621,29 @@ def primary_endpoint(self) -> str | None:
628621
return None
629622
try:
630623
primary = self._patroni.get_primary()
624+
logger.debug(f"primary_endpoint: got primary '{primary}'")
631625
if primary is None and (standby_leader := self._patroni.get_standby_leader()):
626+
logger.debug(f"Using standby_leader {standby_leader} as primary")
632627
primary = standby_leader
633628
primary_endpoint = self._patroni.get_member_ip(primary)
629+
logger.debug(f"primary_endpoint: got primary endpoint '{primary_endpoint}'")
634630
# Force a retry if there is no primary or the member that was
635631
# returned is not in the list of the current cluster members
636632
# (like when the cluster was not updated yet after a failed switchover).
637-
if not primary_endpoint or primary_endpoint not in self._units_ips:
638-
# TODO figure out why peer data is not available
639-
if primary_endpoint and len(self._units_ips) == 1 and len(self._peers.units) > 1:
640-
logger.warning(
641-
"Possibly incomplete peer data: Will not map primary IP to unit IP"
642-
)
643-
return primary_endpoint
644-
logger.debug("primary endpoint early exit: Primary IP not in cached peer list.")
633+
if not primary_endpoint:
634+
logger.warning(f"Missing primary IP for {primary}")
645635
primary_endpoint = None
636+
elif primary_endpoint not in self._units_ips:
637+
if len(self._peers.units) == 0:
638+
logger.info(f"The unit didn't join {PEER} relation? Using {primary_endpoint}")
639+
elif len(self._units_ips) == 1 and len(self._peers.units) > 1:
640+
logger.warning(f"Possibly incomplete peer data, keep using {primary_endpoint}")
641+
else:
642+
logger.debug("Early exit primary_endpoint: Primary IP not in cached peer list")
643+
primary_endpoint = None
644+
logger.debug("primary_endpoint: %s", primary_endpoint)
645+
logger.debug("self._units_ips: %s", self._units_ips)
646+
logger.debug("self._peers.units len: %s", len(self._peers.units))
646647
except RetryError:
647648
return None
648649
else:
@@ -859,6 +860,11 @@ def has_raft_keys(self):
859860
def _peer_relation_changed_checks(self, event: HookEvent) -> bool:
860861
"""Split off to reduce complexity."""
861862
# Prevents the cluster from being reconfigured before it's bootstrapped in the leader.
863+
logger.debug(f"Calling on_peer_relation_changed, event: '{event}'")
864+
if event.unit is None: # or event.unit == self.unit.name:
865+
logger.debug(f"Early exit on_peer_relation_changed: event to itself ({event.unit})")
866+
return False
867+
862868
if not self.is_cluster_initialised:
863869
logger.debug("Early exit on_peer_relation_changed: cluster not initialized")
864870
return False
@@ -1065,13 +1071,15 @@ def add_cluster_member(self, member: str) -> None:
10651071
NotReadyError if either the new member or the current members are not ready.
10661072
"""
10671073
unit = self.model.get_unit(label2name(member))
1074+
logger.debug(f"add_cluster_member: adding unit {unit} to the cluster")
10681075
member_ip = self._get_unit_ip(unit)
10691076

10701077
if not self._patroni.are_all_members_ready():
10711078
logger.info("not all members are ready")
10721079
raise NotReadyError("not all members are ready")
10731080

1074-
# Add the member to the list that should be updated in each other member.
1081+
logger.debug(f"Add member {member_ip} to the members_ips list")
1082+
# It should be updated in each other member.
10751083
self._add_to_members_ips(member_ip)
10761084

10771085
# Update Patroni configuration file.
@@ -1176,10 +1184,14 @@ def _units_ips(self) -> set[str]:
11761184
A list of peers addresses (strings).
11771185
"""
11781186
# Get all members IPs and remove the current unit IP from the list.
1187+
for unit in self._peers.units:
1188+
logger.debug(f"_units_ips: unit {unit}")
11791189
addresses = {self._get_unit_ip(unit) for unit in self._peers.units}
11801190
addresses.add(self._unit_ip)
11811191
if None in addresses:
1192+
logger.debug("_units_ips: Removing None")
11821193
addresses.remove(None)
1194+
logger.debug(f"_units_ips: Final addresses {addresses}")
11831195
return addresses
11841196

11851197
@property
@@ -1310,7 +1322,7 @@ def _on_install(self, event: InstallEvent) -> None:
13101322
self._reboot_on_detached_storage(event)
13111323
return
13121324

1313-
self.set_unit_status(MaintenanceStatus("installing PostgreSQL"))
1325+
self.set_unit_status(MaintenanceStatus("downloading & installing PostgreSQL"))
13141326

13151327
# Install the charmed PostgreSQL snap.
13161328
try:
@@ -1413,10 +1425,8 @@ def _on_config_changed(self, event) -> None: # noqa: C901
14131425
return
14141426

14151427
if self.refresh is None:
1416-
logger.debug("Defer on_config_changed: Refresh could be in progress")
1417-
event.defer()
1418-
return
1419-
if self.refresh.in_progress:
1428+
logger.warning("Warning _on_config_changed: Refresh could be in progress")
1429+
elif self.refresh.in_progress:
14201430
logger.debug("Defer on_config_changed: Refresh in progress")
14211431
event.defer()
14221432
return
@@ -1488,14 +1498,17 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
14881498
continue
14891499
extension = PLUGIN_OVERRIDES.get(extension, extension)
14901500
if self._check_extension_dependencies(extension, enable):
1501+
logger.debug(f"Early exit: {extension} has broken dependencies")
14911502
self.set_unit_status(BlockedStatus(EXTENSIONS_DEPENDENCY_MESSAGE))
14921503
return
14931504
extensions[extension] = enable
14941505
if self.is_blocked and self.unit.status.message == EXTENSIONS_DEPENDENCY_MESSAGE:
1506+
logger.debug("Marking unit as Active")
14951507
self.set_unit_status(ActiveStatus())
14961508
original_status = self.unit.status
14971509
self.set_unit_status(WaitingStatus("Updating extensions"))
14981510
try:
1511+
logger.debug("Enabling/disabling PostgreSQL extensions...")
14991512
self.postgresql.enable_disable_extensions(extensions, database)
15001513
except psycopg2.errors.DependentObjectsStillExist as e:
15011514
logger.error(
@@ -1507,8 +1520,10 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
15071520
except PostgreSQLEnableDisableExtensionError as e:
15081521
logger.exception("failed to change plugins: %s", str(e))
15091522
if original_status.message == EXTENSION_OBJECT_MESSAGE:
1523+
logger.debug("Marking unit as Active and finish with extensions")
15101524
self.set_unit_status(ActiveStatus())
15111525
return
1526+
logger.debug(f"Restoring original unit status to {original_status}")
15121527
self.set_unit_status(original_status)
15131528

15141529
def _check_extension_dependencies(self, extension: str, enable: bool) -> bool:
@@ -1538,10 +1553,8 @@ def _can_start(self, event: StartEvent) -> bool:
15381553

15391554
# Safeguard against starting while refreshing.
15401555
if self.refresh is None:
1541-
logger.debug("Defer on_start: Refresh could be in progress")
1542-
event.defer()
1543-
return False
1544-
if self.refresh.in_progress:
1556+
logger.warning("Warning on_start: Refresh could be in progress")
1557+
elif self.refresh.in_progress:
15451558
# TODO: we should probably start workload if scale up while refresh in progress
15461559
logger.debug("Defer on_start: Refresh in progress")
15471560
event.defer()
@@ -1591,16 +1604,19 @@ def _on_start(self, event: StartEvent) -> None:
15911604
# Only the leader can bootstrap the cluster.
15921605
# On replicas, only prepare for starting the instance later.
15931606
if not self.unit.is_leader():
1607+
logger.debug("Prepare for starting replica instance later")
15941608
self._start_replica(event)
15951609
self._restart_services_after_reboot()
15961610
return
15971611

1598-
# Bootstrap the cluster in the leader unit.
1612+
logger.debug("Bootstrap the cluster in the leader unit")
15991613
self._start_primary(event)
16001614
self._restart_services_after_reboot()
16011615

16021616
def _restart_services_after_reboot(self):
16031617
"""Restart the Patroni and pgBackRest after a reboot."""
1618+
logger.debug(f"_restart_services_after_reboot: self._unit_ip: {self._unit_ip}")
1619+
logger.debug(f"_restart_services_after_reboot: self.members_ips: {self.members_ips}")
16041620
if self._unit_ip in self.members_ips:
16051621
self._patroni.start_patroni()
16061622
self.backup.start_stop_pgbackrest_service()
@@ -1689,6 +1705,8 @@ def _setup_ldap_sync(self, postgres_snap: snap.Snap | None = None) -> None:
16891705
postgres_snap.restart(services=["ldap-sync"])
16901706

16911707
def _setup_users(self) -> None:
1708+
"""Create PostgreSQL users used/operated by charm."""
1709+
logger.debug("Setup PostgreSQL users")
16921710
self.postgresql.create_predefined_instance_roles()
16931711

16941712
# Create the default postgres database user that is needed for some
@@ -1697,14 +1715,14 @@ def _setup_users(self) -> None:
16971715
# This event can be run on a replica if the machines are restarted.
16981716
# For that case, check whether the postgres user already exists.
16991717
users = self.postgresql.list_users()
1700-
# Create the backup user.
17011718
if BACKUP_USER not in users:
1719+
logger.debug(f"Creating user {BACKUP_USER}")
17021720
self.postgresql.create_user(
17031721
BACKUP_USER, new_password(), extra_user_roles=[ROLE_BACKUP]
17041722
)
17051723
self.postgresql.grant_database_privileges_to_user(BACKUP_USER, "postgres", ["connect"])
17061724
if MONITORING_USER not in users:
1707-
# Create the monitoring user.
1725+
logger.debug(f"Creating user {MONITORING_USER}")
17081726
self.postgresql.create_user(
17091727
MONITORING_USER,
17101728
self.get_secret(APP_SCOPE, MONITORING_PASSWORD_KEY),
@@ -1767,16 +1785,15 @@ def _start_primary(self, event: StartEvent) -> None:
17671785
event.defer()
17681786
return
17691787

1770-
# Set the flag to enable the replicas to start the Patroni service.
1788+
logger.debug("Set the flag to enable the replicas to start the Patroni service")
17711789
self._peers.data[self.app]["cluster_initialised"] = "True"
17721790
# Flag to know if triggers need to be removed after refresh
17731791
self._peers.data[self.app]["refresh_remove_trigger"] = "True"
17741792

1775-
# Clear unit data if this unit became a replica after a failover/switchover.
1793+
logger.debug("Clear unit data if this unit became a replica after a failover/switchover")
17761794
self._update_relation_endpoints()
17771795

1778-
# Enable/disable PostgreSQL extensions if they were set before the cluster
1779-
# was fully initialised.
1796+
# if extensions were set before the cluster was fully initialised.
17801797
self.enable_disable_extensions()
17811798

17821799
logger.debug("Active workload time: %s", datetime.now())
@@ -2001,6 +2018,7 @@ def _was_restore_successful(self) -> bool:
20012018

20022019
def _can_run_on_update_status(self) -> bool:
20032020
if not self.is_cluster_initialised:
2021+
logger.debug("Early exit on_update_status: cluster is not initialised")
20042022
return False
20052023

20062024
if self.has_raft_keys():
@@ -2045,8 +2063,8 @@ def _handle_processes_failures(self) -> bool:
20452063
logger.info("PostgreSQL data directory was not empty. Moved pg_wal")
20462064
return True
20472065
try:
2048-
self._patroni.restart_patroni()
20492066
logger.info("restarted PostgreSQL because it was not running")
2067+
self._patroni.restart_patroni()
20502068
return True
20512069
except RetryError:
20522070
logger.error("failed to restart PostgreSQL after checking that it was not running")
@@ -2061,6 +2079,7 @@ def _set_primary_status_message(self) -> None:
20612079
self.set_unit_status(
20622080
BlockedStatus(self.app_peer_data["s3-initialization-block-message"])
20632081
)
2082+
logger.debug("Early exit _set_primary_status_message: s3 is blocked")
20642083
return
20652084
if self.unit.is_leader() and (
20662085
self.app_peer_data.get("logical-replication-validation") == "error"
@@ -2077,12 +2096,12 @@ def _set_primary_status_message(self) -> None:
20772096
danger_state = " (read-only)"
20782097
elif len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
20792098
danger_state = " (degraded)"
2080-
self.set_unit_status(
2081-
ActiveStatus(
2082-
f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
2083-
)
2084-
)
2099+
unit_status = "Standby" if self.is_standby_leader else "Primary"
2100+
unit_status = unit_status + danger_state
2101+
logger.debug(f"Set ActiveStatus({unit_status})")
2102+
self.set_unit_status(ActiveStatus(f"{unit_status}"))
20852103
elif self._patroni.member_started:
2104+
logger.debug("Set ActiveStatus()")
20862105
self.set_unit_status(ActiveStatus())
20872106
except (RetryError, ConnectionError) as e:
20882107
logger.error(f"failed to get primary with error {e}")
@@ -2286,13 +2305,16 @@ def _is_workload_running(self) -> bool:
22862305
@property
22872306
def _can_connect_to_postgresql(self) -> bool:
22882307
try:
2289-
for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
2308+
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
22902309
with attempt:
2310+
logger.debug("Checking connection to PostgreSQL database...")
22912311
if not self.postgresql.get_postgresql_timezones():
2312+
logger.debug("Cannot connect to database (CannotConnectError)")
22922313
raise CannotConnectError
22932314
except RetryError:
2294-
logger.debug("Cannot connect to database")
2315+
logger.debug("Cannot connect to database (RetryError)")
22952316
return False
2317+
logger.debug("Successfully connected to the database")
22962318
return True
22972319

22982320
def update_config(
@@ -2303,6 +2325,7 @@ def update_config(
23032325
refresh: charm_refresh.Machines | None = None,
23042326
) -> bool:
23052327
"""Updates Patroni config file based on the existence of the TLS files."""
2328+
logger.debug("Updating Patroni config")
23062329
if refresh is None:
23072330
refresh = self.refresh
23082331

@@ -2396,6 +2419,7 @@ def update_config(
23962419

23972420
self.unit_peer_data.update({"user_hash": self.generate_user_hash})
23982421
if self.unit.is_leader():
2422+
logger.debug(f"Updating user_hash in app databag on leader: {self.generate_user_hash}")
23992423
self.app_peer_data.update({"user_hash": self.generate_user_hash})
24002424
return True
24012425

@@ -2415,6 +2439,7 @@ def _validate_config_options(self) -> None:
24152439
if not self.postgresql.validate_date_style(self.config.request_date_style):
24162440
raise ValueError("request_date_style config option has an invalid value")
24172441

2442+
logger.debug("Checking timezone config options")
24182443
if self.config.request_time_zone not in self.postgresql.get_postgresql_timezones():
24192444
raise ValueError("request_time_zone config option has an invalid value")
24202445

@@ -2428,6 +2453,7 @@ def _validate_config_options(self) -> None:
24282453

24292454
def _handle_postgresql_restart_need(self) -> None:
24302455
"""Handle PostgreSQL restart need based on the TLS configuration and configuration changes."""
2456+
logger.debug("Checking for PostgreSQL restart necessity")
24312457
if self._can_connect_to_postgresql:
24322458
restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled()
24332459
else:

0 commit comments

Comments
 (0)