Skip to content

Commit a87c52b

Browse files
[DPE-3380] Handle S3 relation in primary non-leader unit (#340)
* Handle S3 relation in primary non-leader unit Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix replication after restore Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Fix unit name Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Speed up events Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Change fast_interval parameter Signed-off-by: Marcelo Henrique Neppel <[email protected]> --------- Signed-off-by: Marcelo Henrique Neppel <[email protected]>
1 parent acc0d26 commit a87c52b

File tree

6 files changed

+422
-39
lines changed

6 files changed

+422
-39
lines changed

poetry.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/backups.py

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,14 @@ def _can_unit_perform_backup(self) -> Tuple[bool, Optional[str]]:
102102

103103
tls_enabled = "tls" in self.charm.unit_peer_data
104104

105+
# Check if this unit is the primary (if it was not possible to retrieve that information,
106+
# then show that the unit cannot perform a backup, because possibly the database is offline).
107+
try:
108+
is_primary = self.charm.is_primary
109+
except RetryError:
110+
return False, "Unit cannot perform backups as the database seems to be offline"
111+
105112
# Only enable backups on primary if there are replicas but TLS is not enabled.
106-
is_primary = self.charm.unit.name == self.charm._patroni.get_primary(
107-
unit_name_pattern=True
108-
)
109113
if is_primary and self.charm.app.planned_units() > 1 and tls_enabled:
110114
return False, "Unit cannot perform backups as it is the cluster primary"
111115

@@ -363,7 +367,7 @@ def _initialise_stanza(self) -> None:
363367
located, how it will be backed up, archiving options, etc. (more info in
364368
https://pgbackrest.org/user-guide.html#quickstart/configure-stanza).
365369
"""
366-
if not self.charm.unit.is_leader():
370+
if not self.charm.is_primary:
367371
return
368372

369373
# Enable stanza initialisation if the backup settings were fixed after being invalid
@@ -403,11 +407,18 @@ def _initialise_stanza(self) -> None:
403407
self.start_stop_pgbackrest_service()
404408

405409
# Store the stanza name to be used in configurations updates.
406-
self.charm.app_peer_data.update({"stanza": self.stanza_name, "init-pgbackrest": "True"})
410+
if self.charm.unit.is_leader():
411+
self.charm.app_peer_data.update(
412+
{"stanza": self.stanza_name, "init-pgbackrest": "True"}
413+
)
414+
else:
415+
self.charm.unit_peer_data.update(
416+
{"stanza": self.stanza_name, "init-pgbackrest": "True"}
417+
)
407418

408419
def check_stanza(self) -> None:
409420
"""Runs the pgbackrest stanza validation."""
410-
if not self.charm.unit.is_leader() or "init-pgbackrest" not in self.charm.app_peer_data:
421+
if not self.charm.is_primary or "init-pgbackrest" not in self.charm.app_peer_data:
411422
return
412423

413424
# Update the configuration to use pgBackRest as the archiving mechanism.
@@ -437,12 +448,37 @@ def check_stanza(self) -> None:
437448
# and rollback the configuration.
438449
self.charm.app_peer_data.update({"stanza": ""})
439450
self.charm.app_peer_data.pop("init-pgbackrest", None)
451+
self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""})
440452
self.charm.update_config()
441453

442454
logger.exception(e)
443455
self.charm.unit.status = BlockedStatus(FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE)
444456

445-
self.charm.app_peer_data.pop("init-pgbackrest", None)
457+
if self.charm.unit.is_leader():
458+
self.charm.app_peer_data.pop("init-pgbackrest", None)
459+
self.charm.unit_peer_data.pop("init-pgbackrest", None)
460+
461+
def coordinate_stanza_fields(self) -> None:
462+
"""Coordinate the stanza name between the primary and the leader units."""
463+
for unit, unit_data in self.charm._peers.data.items():
464+
if "stanza" not in unit_data:
465+
continue
466+
# If the stanza name is not set in the application databag, then the primary is not
467+
# the leader unit, and it's needed to set the stanza name in the application databag.
468+
if "stanza" not in self.charm.app_peer_data and self.charm.unit.is_leader():
469+
self.charm.app_peer_data.update(
470+
{"stanza": self.stanza_name, "init-pgbackrest": "True"}
471+
)
472+
break
473+
# If the stanza was already checked and its name is still in the unit databag, mark
474+
# the stanza as already checked in the application databag and remove it from the
475+
# unit databag.
476+
if "init-pgbackrest" not in unit_data:
477+
if self.charm.unit.is_leader():
478+
self.charm.app_peer_data.pop("init-pgbackrest", None)
479+
if "init-pgbackrest" not in self.charm.app_peer_data and unit == self.charm.unit:
480+
self.charm.unit_peer_data.update({"stanza": ""})
481+
break
446482

447483
@property
448484
def _is_primary_pgbackrest_service_running(self) -> bool:
@@ -469,8 +505,8 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
469505
logger.debug("Cannot set pgBackRest configurations, missing configurations.")
470506
return
471507

472-
# Verify the s3 relation only on the leader
473-
if not self.charm.unit.is_leader():
508+
# Verify the s3 relation only on the primary.
509+
if not self.charm.is_primary:
474510
return
475511

476512
try:
@@ -487,6 +523,9 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
487523
self._initialise_stanza()
488524

489525
def _on_s3_credential_gone(self, _) -> None:
526+
if self.charm.unit.is_leader():
527+
self.charm.app_peer_data.update({"stanza": "", "init-pgbackrest": ""})
528+
self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""})
490529
if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES:
491530
self.charm.unit.status = ActiveStatus()
492531

src/charm.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,18 @@ def _on_peer_relation_changed(self, event: HookEvent):
477477
event.defer()
478478
return
479479

480+
# Restart the workload if it's stuck on the starting state after a timeline divergence
481+
# due to a backup that was restored.
482+
if not self.is_primary and (
483+
self._patroni.member_replication_lag == "unknown"
484+
or int(self._patroni.member_replication_lag) > 1000
485+
):
486+
self._patroni.reinitialize_postgresql()
487+
logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
488+
self.unit.status = MaintenanceStatus("reinitialising replica")
489+
event.defer()
490+
return
491+
480492
# Start or stop the pgBackRest TLS server service when TLS certificate change.
481493
if not self.backup.start_stop_pgbackrest_service():
482494
logger.debug(
@@ -485,6 +497,8 @@ def _on_peer_relation_changed(self, event: HookEvent):
485497
event.defer()
486498
return
487499

500+
self.backup.coordinate_stanza_fields()
501+
488502
self.backup.check_stanza()
489503

490504
if "exporter-started" not in self.unit_peer_data:

tests/integration/test_backups.py

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
get_password,
1919
get_primary,
2020
get_unit_address,
21+
scale_application,
22+
switchover,
2123
wait_for_idle_on_blocked,
2224
)
2325
from .juju_ import juju_major_version
@@ -72,6 +74,7 @@ async def cloud_configs(ops_test: OpsTest, github_secrets) -> None:
7274
}
7375
yield configs, credentials
7476
# Delete the previously created objects.
77+
logger.info("deleting the previously created backups")
7578
for cloud, config in configs.items():
7679
session = boto3.session.Session(
7780
aws_access_key_id=credentials[cloud]["access-key"],
@@ -128,9 +131,10 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> No
128131
**cloud_configs[1][cloud],
129132
)
130133
await action.wait()
131-
await ops_test.model.wait_for_idle(
132-
apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000
133-
)
134+
async with ops_test.fast_forward(fast_interval="60s"):
135+
await ops_test.model.wait_for_idle(
136+
apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000
137+
)
134138

135139
primary = await get_primary(ops_test, f"{database_app_name}/0")
136140
for unit in ops_test.model.applications[database_app_name].units:
@@ -225,6 +229,61 @@ async def test_backup(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -> No
225229
], "backup wasn't correctly restored: table 'backup_table_2' exists"
226230
connection.close()
227231

232+
# Run the following steps only in one cloud (it's enough for those checks).
233+
if cloud == list(cloud_configs[0].keys())[0]:
234+
# Remove the relation to the TLS certificates operator.
235+
await ops_test.model.applications[database_app_name].remove_relation(
236+
f"{database_app_name}:certificates", f"{TLS_CERTIFICATES_APP_NAME}:certificates"
237+
)
238+
await ops_test.model.wait_for_idle(
239+
apps=[database_app_name], status="active", timeout=1000
240+
)
241+
242+
# Scale up to be able to test primary and leader being different.
243+
async with ops_test.fast_forward():
244+
await scale_application(ops_test, database_app_name, 2)
245+
246+
# Ensure replication is working correctly.
247+
new_unit_name = f"{database_app_name}/2"
248+
address = get_unit_address(ops_test, new_unit_name)
249+
with db_connect(
250+
host=address, password=password
251+
) as connection, connection.cursor() as cursor:
252+
cursor.execute(
253+
"SELECT EXISTS (SELECT FROM information_schema.tables"
254+
" WHERE table_schema = 'public' AND table_name = 'backup_table_1');"
255+
)
256+
assert cursor.fetchone()[
257+
0
258+
], f"replication isn't working correctly: table 'backup_table_1' doesn't exist in {new_unit_name}"
259+
cursor.execute(
260+
"SELECT EXISTS (SELECT FROM information_schema.tables"
261+
" WHERE table_schema = 'public' AND table_name = 'backup_table_2');"
262+
)
263+
assert not cursor.fetchone()[
264+
0
265+
], f"replication isn't working correctly: table 'backup_table_2' exists in {new_unit_name}"
266+
connection.close()
267+
268+
switchover(ops_test, primary, new_unit_name)
269+
270+
# Get the new primary unit.
271+
primary = await get_primary(ops_test, new_unit_name)
272+
# Check that the primary changed.
273+
for attempt in Retrying(
274+
stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=30)
275+
):
276+
with attempt:
277+
assert primary == new_unit_name
278+
279+
# Ensure stanza is working correctly.
280+
logger.info("listing the available backups")
281+
action = await ops_test.model.units.get(new_unit_name).run_action("list-backups")
282+
await action.wait()
283+
backups = action.results.get("backups")
284+
assert backups, "backups not outputted"
285+
await ops_test.model.wait_for_idle(status="active", timeout=1000)
286+
228287
# Remove the database app.
229288
await ops_test.model.remove_application(database_app_name, block_until_done=True)
230289
# Remove the TLS operator.

0 commit comments

Comments
 (0)