Commit 813089f

Increase debug verbosity (especially for REST API requests)
* Also use common logger everywhere.

(cherry picked from commit 0d332d9c22c6acb96a165576b9488c7fa31734f6)
1 parent d1d450c commit 813089f
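
For readers unfamiliar with the "common logger" mentioned above: it is the usual module-level logging pattern, where each file creates one logger and all log calls go through it instead of the root `logging` module. A minimal sketch of that pattern (the function and message are illustrative, not taken from this commit):

    import logging

    # One logger per module; replaces direct logging.info(...) / logging.debug(...) calls.
    logger = logging.getLogger(__name__)

    def example(host: str) -> None:
        # Emitted only when the effective log level is DEBUG or lower.
        logger.debug("New DB connection: host='%s' connect_timeout=1", host)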

4 files changed: +125 −60 lines changed

lib/charms/postgresql_k8s/v1/postgresql.py

Lines changed: 3 additions & 1 deletion
@@ -255,8 +255,10 @@ def _connect_to_database(
             psycopg2 connection object.
         """
         host = database_host if database_host is not None else self.primary_host
+        dbname = database if database else self.database
+        logger.debug(f"New DB connection: dbname='{dbname}' user='{self.user}' host='{host}' connect_timeout=1")
         connection = psycopg2.connect(
-            f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'"
+            f"dbname='{dbname}' user='{self.user}' host='{host}'"
             f"password='{self.password}' connect_timeout=1"
         )
         connection.autocommit = True
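
As a standalone illustration of the hunk above, here is a minimal sketch of building the DSN pieces once, logging them at debug level, and reusing them for psycopg2.connect. It assumes psycopg2 is installed; the function name is illustrative, and the password is intentionally kept out of the logged string:

    import logging

    import psycopg2

    logger = logging.getLogger(__name__)

    def connect_to_database(dbname: str, user: str, host: str, password: str):
        # Log the non-sensitive connection parameters before connecting.
        logger.debug(f"New DB connection: dbname='{dbname}' user='{user}' host='{host}' connect_timeout=1")
        connection = psycopg2.connect(
            f"dbname='{dbname}' user='{user}' host='{host}' "
            f"password='{password}' connect_timeout=1"
        )
        connection.autocommit = True
        return connection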

src/charm.py

Lines changed: 67 additions & 44 deletions
@@ -178,7 +178,7 @@ def unit_number(unit_name: str):
         # Lowest unit number is last to refresh
         last_unit_to_refresh = sorted(all_units, key=unit_number)[0].replace("/", "-")
         if self._charm._patroni.get_primary() == last_unit_to_refresh:
-            logging.info(
+            logger.info(
                 f"Unit {last_unit_to_refresh} was already primary during pre-refresh check"
             )
         else:
@@ -188,7 +188,7 @@ def unit_number(unit_name: str):
             logger.warning(f"switchover failed with reason: {e}")
             raise charm_refresh.PrecheckFailed("Unable to switch primary")
         else:
-            logging.info(
+            logger.info(
                 f"Switched primary to unit {last_unit_to_refresh} during pre-refresh check"
             )

@@ -408,6 +408,7 @@ def set_unit_status(
                 json.dumps(refresh.unit_status_lower_priority().message)
             )
             return
+        logger.debug(f"Set unit status: {status}")
         self.unit.status = status

     def _reconcile_refresh_status(self, _=None):
@@ -464,17 +465,10 @@ def patroni_scrape_config(self) -> list[dict]:
     def app_units(self) -> set[Unit]:
         """The peer-related units in the application."""
         if not self._peers:
-            return set()
+            return {self.unit}

         return {self.unit, *self._peers.units}

-    def scoped_peer_data(self, scope: Scopes) -> dict | None:
-        """Returns peer data based on scope."""
-        if scope == APP_SCOPE:
-            return self.app_peer_data
-        elif scope == UNIT_SCOPE:
-            return self.unit_peer_data
-
     @property
     def app_peer_data(self) -> dict:
         """Application peer relation data object."""
@@ -607,7 +601,6 @@ def postgresql(self) -> PostgreSQL:
         """Returns an instance of the object used to interact with the database."""
         password = str(self.get_secret(APP_SCOPE, f"{USER}-password"))
         if self._postgresql is None or self._postgresql.primary_host is None:
-            logger.debug("Init class PostgreSQL")
             self._postgresql = PostgreSQL(
                 primary_host=self.primary_endpoint,
                 current_host=self._unit_ip,
@@ -628,21 +621,29 @@ def primary_endpoint(self) -> str | None:
             return None
         try:
             primary = self._patroni.get_primary()
+            logger.debug(f"primary_endpoint: got primary '{primary}'")
             if primary is None and (standby_leader := self._patroni.get_standby_leader()):
+                logger.debug(f"Using standby_leader {standby_leader} as primary")
                 primary = standby_leader
             primary_endpoint = self._patroni.get_member_ip(primary)
+            logger.debug(f"primary_endpoint: got primary endpoint '{primary_endpoint}'")
             # Force a retry if there is no primary or the member that was
             # returned is not in the list of the current cluster members
             # (like when the cluster was not updated yet after a failed switchover).
-            if not primary_endpoint or primary_endpoint not in self._units_ips:
-                # TODO figure out why peer data is not available
-                if primary_endpoint and len(self._units_ips) == 1 and len(self._peers.units) > 1:
-                    logger.warning(
-                        "Possibly incomplete peer data: Will not map primary IP to unit IP"
-                    )
-                    return primary_endpoint
-                logger.debug("primary endpoint early exit: Primary IP not in cached peer list.")
+            if not primary_endpoint:
+                logger.warning(f"Missing primary IP for {primary}")
                 primary_endpoint = None
+            elif primary_endpoint not in self._units_ips:
+                if len(self._peers.units) == 0:
+                    logger.info(f"The unit didn't join {PEER} relation? Using {primary_endpoint}")
+                elif len(self._units_ips) == 1 and len(self._peers.units) > 1:
+                    logger.warning(f"Possibly incomplete peer data, keep using {primary_endpoint}")
+                else:
+                    logger.debug("Early exit primary_endpoint: Primary IP not in cached peer list")
+                    primary_endpoint = None
+            logger.debug("primary_endpoint: %s", primary_endpoint)
+            logger.debug("self._units_ips: %s", self._units_ips)
+            logger.debug("self._peers.units len: %s", len(self._peers.units))
         except RetryError:
             return None
         else:
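
Side note on the two logging styles visible in this hunk: the f-string calls are formatted even when debug logging is disabled, while the %s form defers interpolation to the logging framework and only formats the message if a handler actually emits it. A tiny sketch of the deferred form (values are illustrative):

    import logging

    logger = logging.getLogger(__name__)
    primary_endpoint = "10.0.0.7"

    # The second argument is only interpolated if the record is actually emitted.
    logger.debug("primary_endpoint: %s", primary_endpoint)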
@@ -859,6 +860,11 @@ def has_raft_keys(self):
     def _peer_relation_changed_checks(self, event: HookEvent) -> bool:
         """Split of to reduce complexity."""
         # Prevents the cluster to be reconfigured before it's bootstrapped in the leader.
+        logger.debug(f"Calling on_peer_relation_changed, event: '{event}'")
+        if hasattr(event, "unit") and event.unit is None:
+            logger.debug(f"Early exit on_peer_relation_changed: event to itself ({event.unit})")
+            return False
+
         if not self.is_cluster_initialised:
             logger.debug("Early exit on_peer_relation_changed: cluster not initialized")
             return False
@@ -1065,13 +1071,15 @@ def add_cluster_member(self, member: str) -> None:
             NotReadyError if either the new member or the current members are not ready.
         """
         unit = self.model.get_unit(label2name(member))
+        logger.debug(f"add_cluster_member: adding unit {unit} to the cluster")
         member_ip = self._get_unit_ip(unit)

         if not self._patroni.are_all_members_ready():
             logger.info("not all members are ready")
             raise NotReadyError("not all members are ready")

-        # Add the member to the list that should be updated in each other member.
+        logger.debug(f"Add member {member_ip} to the members_ips list")
+        # It should be updated in each other member.
         self._add_to_members_ips(member_ip)

         # Update Patroni configuration file.
@@ -1180,6 +1188,7 @@ def _units_ips(self) -> set[str]:
         addresses.add(self._unit_ip)
         if None in addresses:
             addresses.remove(None)
+        logger.debug(f"_units_ips addresses: {addresses}")
         return addresses

     @property
@@ -1310,7 +1319,7 @@ def _on_install(self, event: InstallEvent) -> None:
             self._reboot_on_detached_storage(event)
             return

-        self.set_unit_status(MaintenanceStatus("installing PostgreSQL"))
+        self.set_unit_status(MaintenanceStatus("downloading & installing PostgreSQL"))

         # Install the charmed PostgreSQL snap.
         try:
@@ -1413,10 +1422,8 @@ def _on_config_changed(self, event) -> None: # noqa: C901
             return

         if self.refresh is None:
-            logger.debug("Defer on_config_changed: Refresh could be in progress")
-            event.defer()
-            return
-        if self.refresh.in_progress:
+            logger.warning("Warning _on_config_changed: Refresh could be in progress")
+        elif self.refresh.in_progress:
             logger.debug("Defer on_config_changed: Refresh in progress")
             event.defer()
             return
@@ -1488,14 +1495,17 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
                 continue
             extension = PLUGIN_OVERRIDES.get(extension, extension)
             if self._check_extension_dependencies(extension, enable):
+                logger.debug(f"Early exit: {extension} has broken dependencies")
                 self.set_unit_status(BlockedStatus(EXTENSIONS_DEPENDENCY_MESSAGE))
                 return
             extensions[extension] = enable
         if self.is_blocked and self.unit.status.message == EXTENSIONS_DEPENDENCY_MESSAGE:
+            logger.debug("Marking unit as Active")
             self.set_unit_status(ActiveStatus())
         original_status = self.unit.status
         self.set_unit_status(WaitingStatus("Updating extensions"))
         try:
+            logger.debug("Enabling/disabling PostgreSQL extensions...")
             self.postgresql.enable_disable_extensions(extensions, database)
         except psycopg2.errors.DependentObjectsStillExist as e:
             logger.error(
@@ -1507,8 +1517,10 @@ def enable_disable_extensions(self, database: str | None = None) -> None:
         except PostgreSQLEnableDisableExtensionError as e:
             logger.exception("failed to change plugins: %s", str(e))
         if original_status.message == EXTENSION_OBJECT_MESSAGE:
+            logger.debug("Marking unit as Active and finish with extensions")
             self.set_unit_status(ActiveStatus())
             return
+        logger.debug(f"Restoring original unit status to {original_status}")
         self.set_unit_status(original_status)

     def _check_extension_dependencies(self, extension: str, enable: bool) -> bool:
@@ -1538,10 +1550,8 @@ def _can_start(self, event: StartEvent) -> bool:

         # Safeguard against starting while refreshing.
         if self.refresh is None:
-            logger.debug("Defer on_start: Refresh could be in progress")
-            event.defer()
-            return False
-        if self.refresh.in_progress:
+            logger.warning("Warning on_start: Refresh could be in progress")
+        elif self.refresh.in_progress:
             # TODO: we should probably start workload if scale up while refresh in progress
             logger.debug("Defer on_start: Refresh in progress")
             event.defer()
@@ -1591,16 +1601,19 @@ def _on_start(self, event: StartEvent) -> None:
         # Only the leader can bootstrap the cluster.
         # On replicas, only prepare for starting the instance later.
         if not self.unit.is_leader():
+            logger.debug("Prepare for starting replica instance later")
             self._start_replica(event)
             self._restart_services_after_reboot()
             return

-        # Bootstrap the cluster in the leader unit.
+        logger.debug("Bootstrap the cluster in the leader unit")
         self._start_primary(event)
         self._restart_services_after_reboot()

     def _restart_services_after_reboot(self):
         """Restart the Patroni and pgBackRest after a reboot."""
+        logger.debug(f"_restart_services_after_reboot: self._unit_ip: {self._unit_ip}")
+        logger.debug(f"_restart_services_after_reboot: self.members_ips: {self.members_ips}")
         if self._unit_ip in self.members_ips:
             self._patroni.start_patroni()
             self.backup.start_stop_pgbackrest_service()
@@ -1689,6 +1702,8 @@ def _setup_ldap_sync(self, postgres_snap: snap.Snap | None = None) -> None:
             postgres_snap.restart(services=["ldap-sync"])

     def _setup_users(self) -> None:
+        """Create PostgreSQL users used/operated by charm."""
+        logger.debug("Setup PostgreSQL users")
         self.postgresql.create_predefined_instance_roles()

         # Create the default postgres database user that is needed for some
@@ -1697,14 +1712,14 @@ def _setup_users(self) -> None:
         # This event can be run on a replica if the machines are restarted.
         # For that case, check whether the postgres user already exits.
         users = self.postgresql.list_users()
-        # Create the backup user.
         if BACKUP_USER not in users:
+            logger.debug(f"Creating user {BACKUP_USER}")
             self.postgresql.create_user(
                 BACKUP_USER, new_password(), extra_user_roles=[ROLE_BACKUP]
             )
             self.postgresql.grant_database_privileges_to_user(BACKUP_USER, "postgres", ["connect"])
         if MONITORING_USER not in users:
-            # Create the monitoring user.
+            logger.debug(f"Creating user {MONITORING_USER}")
             self.postgresql.create_user(
                 MONITORING_USER,
                 self.get_secret(APP_SCOPE, MONITORING_PASSWORD_KEY),
@@ -1767,16 +1782,15 @@ def _start_primary(self, event: StartEvent) -> None:
             event.defer()
             return

-        # Set the flag to enable the replicas to start the Patroni service.
+        logger.debug("Set the flag to enable the replicas to start the Patroni service")
         self._peers.data[self.app]["cluster_initialised"] = "True"
         # Flag to know if triggers need to be removed after refresh
         self._peers.data[self.app]["refresh_remove_trigger"] = "True"

-        # Clear unit data if this unit became a replica after a failover/switchover.
+        logger.debug("Clear unit data if this unit became a replica after a failover/switchover")
         self._update_relation_endpoints()

-        # Enable/disable PostgreSQL extensions if they were set before the cluster
-        # was fully initialised.
+        # if extensions were set before the cluster was fully initialised.
         self.enable_disable_extensions()

         logger.debug("Active workload time: %s", datetime.now())
@@ -2001,6 +2015,7 @@ def _was_restore_successful(self) -> bool:

     def _can_run_on_update_status(self) -> bool:
         if not self.is_cluster_initialised:
+            logger.debug("Early exit on_update_status: cluster is not initialised")
             return False

         if self.has_raft_keys():
@@ -2045,8 +2060,8 @@ def _handle_processes_failures(self) -> bool:
             logger.info("PostgreSQL data directory was not empty. Moved pg_wal")
             return True
         try:
-            self._patroni.restart_patroni()
             logger.info("restarted PostgreSQL because it was not running")
+            self._patroni.restart_patroni()
             return True
         except RetryError:
             logger.error("failed to restart PostgreSQL after checking that it was not running")
@@ -2061,6 +2076,7 @@ def _set_primary_status_message(self) -> None:
             self.set_unit_status(
                 BlockedStatus(self.app_peer_data["s3-initialization-block-message"])
             )
+            logger.debug("Early exit _set_primary_status_message: s3 is blocked")
             return
         if self.unit.is_leader() and (
             self.app_peer_data.get("logical-replication-validation") == "error"
@@ -2077,12 +2093,12 @@ def _set_primary_status_message(self) -> None:
                 danger_state = " (read-only)"
             elif len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
                 danger_state = " (degraded)"
-            self.set_unit_status(
-                ActiveStatus(
-                    f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
-                )
-            )
+            unit_status = "Standby" if self.is_standby_leader else "Primary"
+            unit_status = unit_status + danger_state
+            logger.debug(f"Set ActiveStatus({unit_status})")
+            self.set_unit_status(ActiveStatus(f"{unit_status}"))
         elif self._patroni.member_started:
+            logger.debug("Set ActiveStatus()")
             self.set_unit_status(ActiveStatus())
         except (RetryError, ConnectionError) as e:
             logger.error(f"failed to get primary with error {e}")
@@ -2286,13 +2302,16 @@ def _is_workload_running(self) -> bool:
     @property
     def _can_connect_to_postgresql(self) -> bool:
         try:
-            for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
+            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
                 with attempt:
+                    logger.debug("Checking connection to PostgreSQL database...")
                     if not self.postgresql.get_postgresql_timezones():
+                        logger.debug("Cannot connect to database (CannotConnectError)")
                         raise CannotConnectError
         except RetryError:
-            logger.debug("Cannot connect to database")
+            logger.debug("Cannot connect to database (RetryError)")
             return False
+        logger.debug("Successfully connected to the database")
         return True

     def update_config(
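
The retry loop above uses the tenacity library. A minimal, self-contained sketch of the same stop_after_delay/wait_fixed pattern, with a generic check callable standing in for the charm's timezone query (assumes tenacity is installed; names are illustrative):

    import logging

    from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

    logger = logging.getLogger(__name__)

    class CannotConnectError(Exception):
        """Raised when a connection check fails."""

    def can_connect(check) -> bool:
        try:
            # Retry the check for up to 10 seconds, waiting 3 seconds between attempts.
            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
                with attempt:
                    logger.debug("Checking connection to PostgreSQL database...")
                    if not check():
                        raise CannotConnectError
        except RetryError:
            logger.debug("Cannot connect to database (RetryError)")
            return False
        logger.debug("Successfully connected to the database")
        return True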
@@ -2303,6 +2322,7 @@ def update_config(
         refresh: charm_refresh.Machines | None = None,
     ) -> bool:
         """Updates Patroni config file based on the existence of the TLS files."""
+        logger.debug("Updating Patroni config")
         if refresh is None:
             refresh = self.refresh

@@ -2396,6 +2416,7 @@ def update_config(

         self.unit_peer_data.update({"user_hash": self.generate_user_hash})
         if self.unit.is_leader():
+            logger.debug(f"Updating user_hash in app databag on leader: {self.generate_user_hash}")
             self.app_peer_data.update({"user_hash": self.generate_user_hash})
         return True

@@ -2415,6 +2436,7 @@ def _validate_config_options(self) -> None:
         if not self.postgresql.validate_date_style(self.config.request_date_style):
             raise ValueError("request_date_style config option has an invalid value")

+        logger.debug("Checking timezone config options")
         if self.config.request_time_zone not in self.postgresql.get_postgresql_timezones():
             raise ValueError("request_time_zone config option has an invalid value")

@@ -2428,6 +2450,7 @@ def _validate_config_options(self) -> None:

     def _handle_postgresql_restart_need(self) -> None:
         """Handle PostgreSQL restart need based on the TLS configuration and configuration changes."""
+        logger.debug("Checking for PostgreSQL restart necessity")
         if self._can_connect_to_postgresql:
             restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled()
         else:
