Skip to content

Commit 2c3926f

Browse files
[DPE-2582] Point In Time Recovery (#391)
* Add restore-to-time parameter. * Add PITR restore test. * Fix patroni failed PITR check. * Add Patroni service restart condition override with PITR. * Improve restore-to-time parameter processing. * Improve PITR comments. * Improve PITR comments. * Fix unit tests errors caused by PITR. * Improve unit tests with PITR. * Fix PITR. * Improve PITR test. * Fix PITR, apply format. * Add PITR unit test. * Fix PITR integration test. * Format. * Add format check for restore-to-time parameter. Improve PITR fail detection. * Typo fix. * Add ability to restore only with 'restore-to-time' parameter. * Add requiring to move to another bucket after restore. * Add last transaction time logging on PITR fail. * Fix unit tests due to PITR. * Fix unit tests due to PITR. * Fix unit tests due to PITR. * Fix unit tests due to PITR. * Fix PITR integration test. * Improve restore-to-time input format. * Lint. * PITR PR suggestions Co-authored-by: Marcelo Henrique Neppel <[email protected]> * PITR PR suggestions * PITR, improve Patroni restart condition overriding. * Lint. * PITR, s3 stanza wal check. * Add restore-to-time "latest" option. * Fix unit tests. * Fix unit tests. * Fix tls vars naming in integration test_backups. * PITR improvements and fixes. Improve integration backup test. * Format. * Fix backups integration test. * Increase timeout for pitr test. * Fix library Signed-off-by: Marcelo Henrique Neppel <[email protected]> * Minor PITR test improvement. * Lint. --------- Signed-off-by: Marcelo Henrique Neppel <[email protected]> Co-authored-by: Marcelo Henrique Neppel <[email protected]> Co-authored-by: Marcelo Henrique Neppel <[email protected]>
1 parent 51832eb commit 2c3926f

File tree

12 files changed

+782
-40
lines changed

12 files changed

+782
-40
lines changed

actions.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ restore:
4545
backup-id:
4646
type: string
4747
description: A backup-id to identify the backup to restore (format = %Y-%m-%dT%H:%M:%SZ)
48+
restore-to-time:
49+
type: string
50+
description: Point-in-time recovery target as a PostgreSQL timestamp (e.g. 2024-01-01 12:00:00+00) or "latest".
4851
set-password:
4952
description: Change the system user's password, which is used by charm.
5053
It is for internal charm users and SHOULD NOT be used by applications.

lib/charms/postgresql_k8s/v0/postgresql.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636

3737
# Increment this PATCH version before using `charmcraft publish-lib` or reset
3838
# to 0 if you are raising the major API version
39-
LIBPATCH = 29
39+
LIBPATCH = 30
4040

4141
INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles"
4242

@@ -383,6 +383,16 @@ def _generate_database_privileges_statements(
383383
)
384384
return statements
385385

386+
def get_last_archived_wal(self) -> str:
    """Get the name of the last archived WAL for the current PostgreSQL cluster.

    Returns:
        The ``last_archived_wal`` value read from ``pg_stat_archiver``.
        NOTE(review): PostgreSQL documents this column as NULL until a WAL file
        has been archived, so the result may be ``None`` despite the ``str``
        annotation — confirm that callers handle an empty value.

    Raises:
        PostgreSQLGetPostgreSQLVersionError: if the query fails. The name is
            misleading for a WAL lookup, but the type is kept unchanged so that
            existing exception handlers keep working.
    """
    try:
        with self._connect_to_database() as connection, connection.cursor() as cursor:
            cursor.execute("SELECT last_archived_wal FROM pg_stat_archiver;")
            return cursor.fetchone()[0]
    except psycopg2.Error as e:
        logger.error(f"Failed to get PostgreSQL last archived WAL: {e}")
        # Chain the driver error so the root cause stays visible in tracebacks.
        raise PostgreSQLGetPostgreSQLVersionError() from e
395+
386396
def get_postgresql_text_search_configs(self) -> Set[str]:
387397
"""Returns the PostgreSQL available text search configs.
388398

src/backups.py

Lines changed: 97 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,14 @@
4747
"failed to access/create the bucket, check your S3 settings"
4848
)
4949
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE = "failed to initialize stanza, check your S3 settings"
50+
CANNOT_RESTORE_PITR = "cannot restore PITR, juju debug-log for details"
51+
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET = "Move restored cluster to another S3 bucket"
5052

5153
S3_BLOCK_MESSAGES = [
5254
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
5355
FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
5456
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
57+
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
5558
]
5659

5760

@@ -198,9 +201,29 @@ def can_use_s3_repository(self) -> Tuple[bool, Optional[str]]:
198201
if self.charm._patroni.member_started:
199202
self.charm._patroni.reload_patroni_configuration()
200203
return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
204+
return self._is_s3_wal_compatible(stanza)
201205

202206
return True, None
203207

208+
def _is_s3_wal_compatible(self, stanza) -> Tuple[bool, Optional[str]]:
    """Check by WAL parity whether the S3 stanza matches the current PostgreSQL cluster.

    Args:
        stanza: stanza information mapping; only its "archive" list is read here.

    Returns:
        ``(True, None)`` when there is nothing to compare (no archive entries, or
        either WAL name is empty) or when the locally archived WAL name (with any
        ``.suffix`` stripped) equals the "max" WAL of the first archive entry.
        Otherwise ``(False, message)``, where the message depends on whether the
        "require-change-bucket-after-restore" flag is set in the app peer data.
    """
    charm_last_archived_wal = self.charm.postgresql.get_last_archived_wal()
    logger.debug(f"last archived wal: {charm_last_archived_wal}")

    archive_entries = stanza.get("archive", [])
    if len(archive_entries) == 0:
        # Nothing archived in S3 for this stanza, so there is nothing to conflict with.
        return True, None

    s3_last_archived_wal = archive_entries[0].get("max")
    logger.debug(f"last s3 wal: {str(s3_last_archived_wal)}")

    # A comparison is only meaningful when both sides report a WAL name.
    if not charm_last_archived_wal or not s3_last_archived_wal:
        return True, None

    local_wal_name = charm_last_archived_wal.split(".", 1)[0]
    if local_wal_name == str(s3_last_archived_wal):
        return True, None

    # WAL mismatch: pick the message according to the post-restore flag.
    if bool(self.charm.app_peer_data.get("require-change-bucket-after-restore", None)):
        return False, MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
    return False, ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
226+
204227
def _change_connectivity_to_database(self, connectivity: bool) -> None:
205228
"""Enable or disable the connectivity to the database."""
206229
self.charm.unit_peer_data.update({"connectivity": "on" if connectivity else "off"})
@@ -423,11 +446,7 @@ def _initialise_stanza(self) -> None:
423446

424447
# Enable stanza initialisation if the backup settings were fixed after being invalid
425448
# or pointing to a repository where there are backups from another cluster.
426-
if self.charm.is_blocked and self.charm.unit.status.message not in [
427-
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
428-
FAILED_TO_ACCESS_CREATE_BUCKET_ERROR_MESSAGE,
429-
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE,
430-
]:
449+
if self.charm.is_blocked and self.charm.unit.status.message not in S3_BLOCK_MESSAGES:
431450
logger.warning("couldn't initialize stanza due to a blocked status")
432451
return
433452

@@ -554,6 +573,18 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
554573
event.defer()
555574
return
556575

576+
# Prevents config change in bad state, so DB peer relations change event will not cause patroni related errors.
577+
if self.charm.unit.status.message == CANNOT_RESTORE_PITR:
578+
logger.info("Cannot change S3 configuration in bad PITR restore status")
579+
event.defer()
580+
return
581+
582+
# Prevents S3 change in the middle of restoring backup and patroni / pgbackrest errors caused by that.
583+
if "restoring-backup" in self.charm.app_peer_data:
584+
logger.info("Cannot change S3 configuration during restore")
585+
event.defer()
586+
return
587+
557588
if not self._render_pgbackrest_conf_file():
558589
logger.debug("Cannot set pgBackRest configurations, missing configurations.")
559590
return
@@ -567,6 +598,8 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
567598
if not self.charm.is_primary:
568599
return
569600

601+
self.charm.app_peer_data.pop("require-change-bucket-after-restore", None)
602+
570603
try:
571604
self._create_bucket_if_not_exists()
572605
except (ClientError, ValueError):
@@ -582,7 +615,11 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
582615

583616
def _on_s3_credential_gone(self, _) -> None:
584617
if self.charm.unit.is_leader():
585-
self.charm.app_peer_data.update({"stanza": "", "init-pgbackrest": ""})
618+
self.charm.app_peer_data.update({
619+
"stanza": "",
620+
"init-pgbackrest": "",
621+
"require-change-bucket-after-restore": "",
622+
})
586623
self.charm.unit_peer_data.update({"stanza": "", "init-pgbackrest": ""})
587624
if self.charm.is_blocked and self.charm.unit.status.message in S3_BLOCK_MESSAGES:
588625
self.charm.unit.status = ActiveStatus()
@@ -753,20 +790,42 @@ def _on_restore_action(self, event):
753790
return
754791

755792
backup_id = event.params.get("backup-id")
756-
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
793+
restore_to_time = event.params.get("restore-to-time")
794+
logger.info(
795+
f"A restore"
796+
f"{' with backup-id ' + backup_id if backup_id else ''}"
797+
f"{' to time point ' + restore_to_time if restore_to_time else ''}"
798+
f" has been requested on the unit"
799+
)
757800

758-
# Validate the provided backup id.
759-
logger.info("Validating provided backup-id")
801+
# Validate the provided backup id and restore to time.
802+
logger.info("Validating provided backup-id and restore-to-time")
760803
try:
761804
backups = self._list_backups(show_failed=False)
762-
if backup_id not in backups.keys():
805+
if backup_id and backup_id not in backups.keys():
763806
error_message = f"Invalid backup-id: {backup_id}"
764807
logger.error(f"Restore failed: {error_message}")
765808
event.fail(error_message)
766809
return
810+
if not backup_id and restore_to_time and not backups:
811+
error_message = "Cannot restore PITR without any backups created"
812+
logger.error(f"Restore failed: {error_message}")
813+
event.fail(error_message)
814+
return
767815
except ListBackupsError as e:
768816
logger.exception(e)
769-
error_message = "Failed to retrieve backup id"
817+
error_message = "Failed to retrieve backups list"
818+
logger.error(f"Restore failed: {error_message}")
819+
event.fail(error_message)
820+
return
821+
822+
# Quick check for timestamp format
823+
if (
824+
restore_to_time
825+
and restore_to_time != "latest"
826+
and not re.match("^[0-9-]+ [0-9:.+]+$", restore_to_time)
827+
):
828+
error_message = "Bad restore-to-time format"
770829
logger.error(f"Restore failed: {error_message}")
771830
event.fail(error_message)
772831
return
@@ -781,6 +840,17 @@ def _on_restore_action(self, event):
781840
event.fail(error_message)
782841
return
783842

843+
# Temporarily disable the Patroni service auto-restart. This is required because point-in-time recovery can fail
844+
# on restore, i.e. during the cluster bootstrapping process. In this case, we need to be able to check the Patroni
845+
# service status and logs. Disabling the auto-restart feature is essential to prevent a wrong status being reported
846+
# and a race condition when reading logs (as logs are cleared / moved on service restarts).
847+
if not self.charm.override_patroni_restart_condition("no", "restore-backup"):
848+
error_message = "Failed to override Patroni restart condition"
849+
logger.error(f"Restore failed: {error_message}")
850+
event.fail(error_message)
851+
self._restart_database()
852+
return
853+
784854
logger.info("Removing the contents of the data directory")
785855
if not self._empty_data_files():
786856
error_message = "Failed to remove contents of the data directory"
@@ -792,8 +862,12 @@ def _on_restore_action(self, event):
792862
# Mark the cluster as in a restoring backup state and update the Patroni configuration.
793863
logger.info("Configuring Patroni to restore the backup")
794864
self.charm.app_peer_data.update({
795-
"restoring-backup": self._fetch_backup_from_id(backup_id),
796-
"restore-stanza": backups[backup_id],
865+
"restoring-backup": self._fetch_backup_from_id(backup_id) if backup_id else "",
866+
"restore-stanza": backups[backup_id]
867+
if backup_id
868+
else self.charm.app_peer_data.get("stanza", self.stanza_name),
869+
"restore-to-time": restore_to_time or "",
870+
"require-change-bucket-after-restore": "True",
797871
})
798872
self.charm.update_config()
799873

@@ -865,17 +939,20 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
865939
event.fail(validation_message)
866940
return False
867941

868-
if not event.params.get("backup-id"):
869-
error_message = "Missing backup-id to restore"
942+
if not event.params.get("backup-id") and not event.params.get("restore-to-time"):
943+
error_message = (
944+
"Missing backup-id or/and restore-to-time parameter to be able to do restore"
945+
)
870946
logger.error(f"Restore failed: {error_message}")
871947
event.fail(error_message)
872948
return False
873949

874950
logger.info("Checking if cluster is in blocked state")
875-
if (
876-
self.charm.is_blocked
877-
and self.charm.unit.status.message != ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE
878-
):
951+
if self.charm.is_blocked and self.charm.unit.status.message not in [
952+
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
953+
CANNOT_RESTORE_PITR,
954+
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
955+
]:
879956
error_message = "Cluster or unit is in a blocking state"
880957
logger.error(f"Restore failed: {error_message}")
881958
event.fail(error_message)
@@ -941,7 +1018,7 @@ def _render_pgbackrest_conf_file(self) -> bool:
9411018

9421019
def _restart_database(self) -> None:
9431020
"""Removes the restoring backup flag and restart the database."""
944-
self.charm.app_peer_data.update({"restoring-backup": ""})
1021+
self.charm.app_peer_data.update({"restoring-backup": "", "restore-to-time": ""})
9451022
self.charm.update_config()
9461023
self.charm._patroni.start_patroni()
9471024

0 commit comments

Comments
 (0)