Skip to content

Commit 39f6f8c

Browse files
committed
Merge branch '16/edge' into test-app-16
2 parents f060df9 + 40e678b commit 39f6f8c

File tree

31 files changed

+516
-134
lines changed

31 files changed

+516
-134
lines changed

docs/explanation/logs.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ The naming convention of the Pgbackrest logs is `<model name>.patroni-<postgresq
7878
2023-10-11 13:07:45.146 P00 INFO: expire command end: completed successfully (353ms)
7979
root@pg-0:/# cat /var/log/pgbackrest/discourse.patroni-pg-backup.log
8080
-------------------PROCESS START-------------------
81-
2023-10-11 13:06:29.857 P00 INFO: backup command begin 2.47: --no-backup-standby --exec-id=843-b0d896e1 --log-level-console=debug --pg1-path=/var/lib/postgresql/data/pgdata --pg1-user=backup --repo1-path=/postgresql-test --repo1-retention-full=9999999 --repo1-s3-bucket=dragop-test-bucket --repo1-s3-endpoint=https://s3.eu-central-1.amazonaws.com --repo1-s3-key=<redacted> --repo1-s3-key-secret=<redacted> --repo1-s3-region=eu-central-1 --repo1-s3-uri-style=host --repo1-type=s3 --stanza=discourse.patroni-pg --start-fast --type=full
81+
2023-10-11 13:06:29.857 P00 INFO: backup command begin 2.47: --no-backup-standby --exec-id=843-b0d896e1 --log-level-console=debug --pg1-path=/var/lib/pg/data/16/main --pg1-user=backup --repo1-path=/postgresql-test --repo1-retention-full=9999999 --repo1-s3-bucket=dragop-test-bucket --repo1-s3-endpoint=https://s3.eu-central-1.amazonaws.com --repo1-s3-key=<redacted> --repo1-s3-key-secret=<redacted> --repo1-s3-region=eu-central-1 --repo1-s3-uri-style=host --repo1-type=s3 --stanza=discourse.patroni-pg --start-fast --type=full
8282
2023-10-11 13:06:30.869 P00 INFO: execute non-exclusive backup start: backup begins after the requested immediate checkpoint completes
8383
2023-10-11 13:06:31.671 P00 INFO: backup start archive = 000000010000000000000004, lsn = 0/4000060
8484
2023-10-11 13:06:31.671 P00 INFO: check archive for prior segment 000000010000000000000003

docs/reference/troubleshooting.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,9 @@ postgresql enabled active today at 12:29 UTC
119119
root@postgresql-k8s-0:/# ps auxww
120120
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
121121
root 1 0.0 0.0 718264 10916 ? Ssl 12:29 0:00 /charm/bin/pebble run --create-dirs --hold --http :38813 --verbose
122-
postgres 14 0.1 0.1 565020 39412 ? Sl 12:29 0:01 python3 /usr/bin/patroni /var/lib/postgresql/data/patroni.yml
122+
postgres 14 0.1 0.1 565020 39412 ? Sl 12:29 0:01 python3 /usr/bin/patroni /var/lib/pg/data/patroni.yml
123123
postgres 30 0.0 0.0 1082704 9076 ? Sl 12:30 0:00 /usr/bin/prometheus-postgres-exporter
124-
postgres 48 0.0 0.0 215488 28912 ? S 12:30 0:00 /usr/lib/postgresql/14/bin/postgres -D /var/lib/postgresql/data/pgdata --config-file=/var/lib/postgresql/data/pgdata/postgresql.conf --listen_addresses=0.0.0.0 --port=5432 --cluster_name=patroni-postgresql-k8s --wal_level=logical --hot_standby=on --max_connections=100 --max_wal_senders=10 --max_prepared_transactions=0 --max_locks_per_transaction=64 --track_commit_timestamp=off --max_replication_slots=10 --max_worker_processes=8 --wal_log_hints=on
124+
postgres 48 0.0 0.0 215488 28912 ? S 12:30 0:00 /usr/lib/postgresql/16/bin/postgres -D /var/lib/pg/data/16/main --config-file=/var/lib/pg/data/16/main/postgresql.conf --listen_addresses=0.0.0.0 --port=5432 --cluster_name=patroni-postgresql-k8s --wal_level=logical --hot_standby=on --max_connections=100 --max_wal_senders=10 --max_prepared_transactions=0 --max_locks_per_transaction=64 --track_commit_timestamp=off --max_replication_slots=10 --max_worker_processes=8 --wal_log_hints=on
125125
postgres 50 0.0 0.0 70080 7488 ? Ss 12:30 0:00 postgres: patroni-postgresql-k8s: logger
126126
postgres 52 0.0 0.0 215592 9136 ? Ss 12:30 0:00 postgres: patroni-postgresql-k8s: checkpointer
127127
postgres 53 0.0 0.0 215604 9632 ? Ss 12:30 0:00 postgres: patroni-postgresql-k8s: background writer

metadata.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,19 @@ containers:
2929
resource: postgresql-image
3030
mounts:
3131
- storage: archive
32-
location: /var/lib/postgresql/archive
32+
location: /var/lib/pg/archive
3333
- storage: data
34-
location: /var/lib/postgresql/data
34+
location: /var/lib/pg/data
3535
- storage: logs
36-
location: /var/lib/postgresql/logs
36+
location: /var/lib/pg/logs
3737
- storage: temp
38-
location: /var/lib/postgresql/temp
38+
location: /var/lib/pg/temp
3939

4040
resources:
4141
postgresql-image:
4242
type: oci-image
4343
description: OCI image for PostgreSQL
44-
upstream-source: ghcr.io/canonical/charmed-postgresql@sha256:fcb648db2c418403fbf5f58f04892c4677f113526097ca7314cbd9f527bb1baf # renovate: oci-image tag: 16.11-24.04_edge
44+
upstream-source: ghcr.io/canonical/charmed-postgresql@sha256:840100acce8597fe94f1ba5a4d8599a50d4acc9b351d9ec146d9ebb6099e5aa9 # renovate: oci-image tag: 16.11-24.04_edge
4545

4646
peers:
4747
database-peers:
@@ -109,19 +109,19 @@ storage:
109109
archive:
110110
type: filesystem
111111
description: Storage mount used for holding local backups (before typically sending them to remote object storage) when relevant/needed.
112-
location: /var/lib/postgresql/archive
112+
location: /var/lib/pg/archive
113113
data:
114114
type: filesystem
115115
description: Storage mount used for storing all tables, indexes, and so on (except those from temporary tablespaces).
116-
location: /var/lib/postgresql/data
116+
location: /var/lib/pg/data
117117
logs:
118118
type: filesystem
119119
description: Storage mount used for storing all the logs that are part of the transactional commit path (WAL files).
120-
location: /var/lib/postgresql/logs
120+
location: /var/lib/pg/logs
121121
temp:
122122
type: filesystem
123123
description: Storage mount used for storing temporary tablespaces (where typically sort operations happen).
124-
location: /var/lib/postgresql/temp
124+
location: /var/lib/pg/temp
125125

126126
assumes:
127127
- k8s-api

scripts/authorisation_rules_observer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
PATRONI_CLUSTER_STATUS_ENDPOINT = "cluster"
2020
PATRONI_CONFIG_STATUS_ENDPOINT = "config"
2121
TLS_CA_BUNDLE_FILE = "peer_ca_bundle.pem"
22-
PATRONI_CONF_FILE_PATH = "/var/lib/postgresql/data/patroni.yml"
22+
PATRONI_CONF_FILE_PATH = "/var/lib/pg/data/patroni.yml"
2323

2424
# File path for the spawned cluster topology observer process to write logs.
2525
LOG_FILE_PATH = "/var/log/authorisation_rules_observer.log"

src/backups.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,13 @@
3131
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
3232

3333
from constants import (
34+
ARCHIVE_PATH,
3435
BACKUP_TYPE_OVERRIDES,
3536
BACKUP_USER,
37+
LOGS_STORAGE_PATH,
3638
PGBACKREST_LOGROTATE_FILE,
39+
POSTGRESQL_DATA_PATH,
40+
TEMP_STORAGE_PATH,
3741
WORKLOAD_OS_GROUP,
3842
WORKLOAD_OS_USER,
3943
)
@@ -195,7 +199,7 @@ def can_use_s3_repository(self) -> tuple[bool, str]:
195199

196200
system_identifier_from_instance, error = self._execute_command([
197201
f"/usr/lib/postgresql/{self.charm._patroni.rock_postgresql_version.split('.')[0]}/bin/pg_controldata",
198-
"/var/lib/postgresql/data/pgdata",
202+
POSTGRESQL_DATA_PATH,
199203
])
200204
if error != "":
201205
raise Exception(error)
@@ -272,15 +276,24 @@ def _create_bucket_if_not_exists(self) -> None:
272276

273277
def _empty_data_files(self) -> None:
274278
"""Empty the PostgreSQL archive, data, logs and temp directories in preparation for a backup restore."""
275-
try:
276-
self.container.exec(["rm", "-r", "/var/lib/postgresql/data/pgdata"]).wait_output()
277-
except ExecError as e:
278-
# If previous PITR restore was unsuccessful, there is no such directory.
279-
if "No such file or directory" not in str(e.stderr):
280-
logger.exception(
281-
"Failed to empty data directory in prep for backup restore", exc_info=e
282-
)
283-
raise
279+
# Clear all storage directories, not just data. The logs directory must be cleared
280+
# so that when new replicas join after restore, pg_basebackup can use the --waldir
281+
# option (which requires an empty directory).
282+
for path in [
283+
ARCHIVE_PATH,
284+
self.charm._actual_pgdata_path,
285+
LOGS_STORAGE_PATH,
286+
TEMP_STORAGE_PATH,
287+
]:
288+
try:
289+
self.container.exec(["find", path, "-mindepth", "1", "-delete"]).wait_output()
290+
except ExecError as e:
291+
# If a previous PITR restore was unsuccessful, the directory may not exist.
292+
if "No such file or directory" not in str(e.stderr):
293+
logger.exception(
294+
f"Failed to empty {path} in prep for backup restore", exc_info=e
295+
)
296+
raise
284297

285298
def _change_connectivity_to_database(self, connectivity: bool) -> None:
286299
"""Enable or disable the connectivity to the database."""
@@ -1212,6 +1225,7 @@ def _render_pgbackrest_conf_file(self) -> bool:
12121225
secret_key=s3_parameters["secret-key"],
12131226
stanza=self.stanza_name,
12141227
storage_path=self.charm._storage_path,
1228+
pgdata_path=POSTGRESQL_DATA_PATH,
12151229
user=BACKUP_USER,
12161230
retention_full=s3_parameters["delete-older-than-days"],
12171231
process_max=max(cpu_count - 2, 1),
@@ -1234,7 +1248,9 @@ def _render_pgbackrest_conf_file(self) -> bool:
12341248
"/home/postgres/rotate_logs.py",
12351249
f.read(),
12361250
)
1237-
self.container.start(self.charm.rotate_logs_service)
1251+
services = self.container.pebble.get_services(names=[self.charm.rotate_logs_service])
1252+
if services:
1253+
self.container.start(self.charm.rotate_logs_service)
12381254

12391255
return True
12401256

src/charm.py

Lines changed: 117 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134
SECRET_KEY_OVERRIDES,
135135
SPI_MODULE,
136136
SYSTEM_USERS,
137+
TEMP_STORAGE_PATH,
137138
TLS_CA_BUNDLE_FILE,
138139
TLS_CA_FILE,
139140
TLS_CERT_FILE,
@@ -249,7 +250,8 @@ def __init__(self, *args):
249250

250251
self._certs_path = "/usr/local/share/ca-certificates"
251252
self._storage_path = str(self.meta.storages["data"].location)
252-
self.pgdata_path = f"{self._storage_path}/pgdata"
253+
self._actual_pgdata_path = f"{self._storage_path}/16/main"
254+
self.pgdata_path = "/var/lib/postgresql/16/main"
253255

254256
self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)
255257
self.postgresql_client_relation = PostgreSQLProvider(self)
@@ -1109,17 +1111,108 @@ def fix_leader_annotation(self) -> bool:
11091111
raise e
11101112
return True
11111113

1114+
def _replica_can_start(self) -> bool:
1115+
"""Check whether this replica is ready to start Patroni.
1116+
1117+
Returns False if the cluster hasn't been bootstrapped yet, or if the
1118+
leader hasn't added this unit's endpoint to the members list (which
1119+
controls the pg_hba replication entries on the primary).
1120+
"""
1121+
if not self.is_cluster_initialised:
1122+
logger.debug("Replica not ready: cluster not initialized")
1123+
return False
1124+
if self._endpoint not in self._endpoints:
1125+
logger.debug("Replica not ready: endpoint not yet in members list")
1126+
return False
1127+
return True
1128+
11121129
def _create_pgdata(self, container: Container):
1113-
"""Create the PostgreSQL data directory."""
1114-
if not container.exists(self.pgdata_path):
1130+
"""Create the PostgreSQL data directories."""
1131+
logs_path = str(self.meta.storages["logs"].location)
1132+
waldir_path = f"{logs_path}/16/main/pg_wal"
1133+
temp_path = str(self.meta.storages["temp"].location)
1134+
temp_tablespace_path = f"{temp_path}/16/main/pgsql_tmp"
1135+
1136+
# Clear stale storage directories when a replica joins an initialized cluster.
1137+
# This is needed because PersistentVolumes may retain data from previous pods,
1138+
# and pg_basebackup requires empty --waldir and --tablespace directories.
1139+
# Clear on every call until pgdata is populated (PG_VERSION exists), since
1140+
# pg_basebackup can fail and leave stale files that prevent retries.
1141+
pgdata_populated = container.exists(f"{self._actual_pgdata_path}/PG_VERSION")
1142+
if not self.unit.is_leader() and self.is_cluster_initialised and not pgdata_populated:
1143+
for path in [waldir_path, temp_tablespace_path]:
1144+
if container.exists(path):
1145+
try:
1146+
container.exec(["find", path, "-mindepth", "1", "-delete"]).wait_output()
1147+
logger.info(
1148+
f"Cleared stale content from {path} for replica initialization"
1149+
)
1150+
except ExecError as e:
1151+
if "No such file or directory" not in str(e.stderr):
1152+
logger.warning(f"Failed to clear {path}: {e}")
1153+
1154+
# Create the pgdata directory on the storage mount (e.g., /var/lib/pg/data/16/main)
1155+
if not container.exists(self._actual_pgdata_path):
1156+
container.make_dir(
1157+
self._actual_pgdata_path,
1158+
permissions=0o700,
1159+
user=WORKLOAD_OS_USER,
1160+
group=WORKLOAD_OS_GROUP,
1161+
make_parents=True,
1162+
)
1163+
# Create the WAL directory (e.g., /var/lib/pg/logs/16/main/pg_wal)
1164+
if not container.exists(waldir_path):
11151165
container.make_dir(
1116-
self.pgdata_path, permissions=0o700, user=WORKLOAD_OS_USER, group=WORKLOAD_OS_GROUP
1166+
waldir_path,
1167+
permissions=0o700,
1168+
user=WORKLOAD_OS_USER,
1169+
group=WORKLOAD_OS_GROUP,
1170+
make_parents=True,
11171171
)
1172+
# Create the temp tablespace directory (e.g., /var/lib/pg/temp/16/main/pgsql_tmp)
1173+
if not container.exists(temp_tablespace_path):
1174+
container.make_dir(
1175+
temp_tablespace_path,
1176+
permissions=0o700,
1177+
user=WORKLOAD_OS_USER,
1178+
group=WORKLOAD_OS_GROUP,
1179+
make_parents=True,
1180+
)
1181+
# Create a symlink from the default PostgreSQL data directory to our data directory
1182+
# (e.g., /var/lib/postgresql/16/main -> /var/lib/pg/data/16/main)
1183+
# Patroni and other tools will use the symlink path (self.pgdata_path)
1184+
# Note: This symlink is on ephemeral storage and may not persist across container restarts.
1185+
# It gets recreated on each pebble-ready event.
1186+
# The OCI image ships /var/lib/postgresql/16/main as a real directory, so we must
1187+
# remove it first if it exists as a non-symlink (e.g., on replicas).
1188+
container.make_dir(
1189+
"/var/lib/postgresql/16",
1190+
user=WORKLOAD_OS_USER,
1191+
group=WORKLOAD_OS_GROUP,
1192+
make_parents=True,
1193+
)
1194+
container.exec([
1195+
"bash",
1196+
"-c",
1197+
f"[ -L {self.pgdata_path} ] || rm -rf {self.pgdata_path}",
1198+
]).wait()
1199+
container.exec([
1200+
"ln",
1201+
"-sfn",
1202+
self._actual_pgdata_path,
1203+
self.pgdata_path,
1204+
]).wait()
1205+
container.exec([
1206+
"chown",
1207+
"-h",
1208+
f"{WORKLOAD_OS_USER}:{WORKLOAD_OS_GROUP}",
1209+
self.pgdata_path,
1210+
]).wait()
11181211
# Also, fix the ownership of the storage mount directories.
11191212
container.exec([
11201213
"chown",
11211214
f"{WORKLOAD_OS_USER}:{WORKLOAD_OS_GROUP}",
1122-
"/var/lib/postgresql/archive",
1215+
str(self.meta.storages["archive"].location),
11231216
]).wait()
11241217
container.exec([
11251218
"chown",
@@ -1129,12 +1222,12 @@ def _create_pgdata(self, container: Container):
11291222
container.exec([
11301223
"chown",
11311224
f"{WORKLOAD_OS_USER}:{WORKLOAD_OS_GROUP}",
1132-
"/var/lib/postgresql/logs",
1225+
logs_path,
11331226
]).wait()
11341227
container.exec([
11351228
"chown",
11361229
f"{WORKLOAD_OS_USER}:{WORKLOAD_OS_GROUP}",
1137-
"/var/lib/postgresql/temp",
1230+
temp_path,
11381231
]).wait()
11391232

11401233
def _on_start(self, _) -> None:
@@ -1170,15 +1263,14 @@ def _on_postgresql_pebble_ready(self, event: WorkloadEvent) -> None:
11701263
# where the volume is mounted with more restrictive permissions.
11711264
self._create_pgdata(container)
11721265

1173-
# Defer the initialization of the workload in the replicas
1174-
# if the cluster hasn't been bootstrap on the primary yet.
1175-
# Otherwise, each unit will create a different cluster and
1176-
# any update in the members list on the units won't have effect
1177-
# on fixing that.
1178-
if not self.unit.is_leader() and not self.is_cluster_initialised:
1179-
logger.debug(
1180-
"Deferring on_postgresql_pebble_ready: Not leader and cluster not initialized"
1181-
)
1266+
# Defer the initialization of the workload in the replicas if the cluster
1267+
# hasn't been bootstrapped on the primary yet, or the leader hasn't added
1268+
# this unit's endpoint to the cluster members list yet. The endpoint
1269+
# controls pg_hba replication entries on the primary — without it,
1270+
# pg_basebackup is rejected, triggering retries and remove_data_directory()
1271+
# calls that can race with _create_pgdata() and break the pg_wal symlink
1272+
# created by --waldir.
1273+
if not self.unit.is_leader() and not self._replica_can_start():
11821274
event.defer()
11831275
return
11841276

@@ -1337,7 +1429,7 @@ def _setup_users(self) -> None:
13371429
extra_user_roles=["pg_monitor"],
13381430
)
13391431

1340-
self.postgresql.set_up_database(temp_location="/var/lib/postgresql/temp")
1432+
self.postgresql.set_up_database(temp_location=f"{TEMP_STORAGE_PATH}/16/main/pgsql_tmp")
13411433

13421434
access_groups = self.postgresql.list_access_groups()
13431435
if access_groups != set(ACCESS_GROUPS):
@@ -1587,6 +1679,7 @@ def _fix_pod(self) -> None:
15871679
# Recreate k8s resources and add labels required for replication
15881680
# when the pod loses them (like when it's deleted).
15891681
self.push_tls_files_to_workload()
1682+
15901683
if self.refresh is not None and not self.refresh.in_progress:
15911684
try:
15921685
self._create_services()
@@ -1733,9 +1826,9 @@ def _on_update_status_early_exit_checks(self, container) -> bool:
17331826
def _check_pgdata_storage_size(self) -> None:
17341827
"""Asserts that pgdata volume has at least 10% free space and blocks charm if not."""
17351828
try:
1736-
total_size, _, free_size = shutil.disk_usage(self.pgdata_path)
1829+
total_size, _, free_size = shutil.disk_usage(self._actual_pgdata_path)
17371830
except FileNotFoundError:
1738-
logger.error("pgdata folder not found in %s", self.pgdata_path)
1831+
logger.error("pgdata folder not found in %s", self._actual_pgdata_path)
17391832
return
17401833

17411834
logger.debug(
@@ -1870,6 +1963,7 @@ def _patroni(self):
18701963
self.primary_endpoint,
18711964
self._namespace,
18721965
self._storage_path,
1966+
self.pgdata_path,
18731967
self.get_secret(APP_SCOPE, USER_PASSWORD_KEY),
18741968
self.get_secret(APP_SCOPE, REPLICATION_PASSWORD_KEY),
18751969
self.get_secret(APP_SCOPE, REWIND_PASSWORD_KEY),
@@ -2582,7 +2676,10 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
25822676
logger.warning("Early exit update_config: Unable to patch Patroni API")
25832677
return False
25842678

2585-
self._patroni.ensure_slots_controller_by_patroni(replication_slots)
2679+
if not self._patroni.ensure_slots_controller_by_patroni(replication_slots):
2680+
logger.warning(
2681+
"Failed to sync replication slots with Patroni — will retry on next config update"
2682+
)
25862683

25872684
self._handle_postgresql_restart_need(
25882685
self.unit_peer_data.get("config_hash") != self.generate_config_hash

src/constants.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,15 @@
1919
WORKLOAD_OS_USER = "postgres"
2020
METRICS_PORT = "9187"
2121
PGBACKREST_METRICS_PORT = "9854"
22-
POSTGRESQL_DATA_PATH = "/var/lib/postgresql/data/pgdata"
22+
POSTGRESQL_DATA_PATH = "/var/lib/postgresql/16/main"
2323
POSTGRESQL_LOGS_PATH = "/var/log/postgresql"
24+
25+
# Storage mount paths (must match metadata.yaml storage locations).
26+
STORAGE_PATH = "/var/lib/pg"
27+
ARCHIVE_PATH = f"{STORAGE_PATH}/archive"
28+
DATA_STORAGE_PATH = f"{STORAGE_PATH}/data"
29+
LOGS_STORAGE_PATH = f"{STORAGE_PATH}/logs"
30+
TEMP_STORAGE_PATH = f"{STORAGE_PATH}/temp"
2431
POSTGRESQL_LOGS_PATTERN = "postgresql*.log"
2532
POSTGRES_LOG_FILES = [
2633
"/var/log/pgbackrest/*",

0 commit comments

Comments
 (0)