Skip to content

Commit f6ff193

Browse files
committed
[DPE-9070] Handle pgBackRest archive timeout in check_stanza
Port archive timeout error handling (error code 82) from PR #1328 to allow users to fix network issues and retry with `juju resolve`. When archive operations time out, the charm enters error state instead of blocked state, enabling recovery via `juju resolve`. Fixes #1346. Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
1 parent e139766 commit f6ff193

File tree

3 files changed

+26
-1
lines changed

3 files changed

+26
-1
lines changed

src/backups.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
BACKUP_TYPE_OVERRIDES,
3535
BACKUP_USER,
3636
PATRONI_CONF_PATH,
37+
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE,
3738
PGBACKREST_BACKUP_ID_FORMAT,
3839
PGBACKREST_CONF_PATH,
3940
PGBACKREST_CONFIGURATION_FILE,
@@ -717,15 +718,27 @@ def check_stanza(self) -> bool:
717718
# for that or else the s3 initialization sequence will fail.
718719
for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True):
719720
with attempt:
720-
return_code, _, stderr = self._execute_command([
721+
return_code, stdout, stderr = self._execute_command([
721722
PGBACKREST_EXECUTABLE,
722723
PGBACKREST_CONFIGURATION_FILE,
723724
f"--stanza={self.stanza_name}",
724725
"check",
725726
])
727+
if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE:
728+
# Raise an error if the archive command times out, so the user has the possibility
729+
# to fix network issues and call juju resolve to re-trigger the hook that calls
730+
# this method.
731+
extracted_error = self._extract_error_message(stdout, stderr)
732+
logger.error(
733+
f"error: {extracted_error} - please fix the error and call juju resolve on this unit"
734+
)
735+
raise TimeoutError
726736
if return_code != 0:
727737
raise Exception(stderr)
728738
self.charm._set_primary_status_message()
739+
except TimeoutError as e:
740+
# Re-raise to put charm in error state (not blocked), allowing juju resolve
741+
raise e
729742
except Exception:
730743
# If the check command doesn't succeed, remove the stanza name
731744
# and rollback the configuration.

src/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626

2727
# Snap constants.
2828
PGBACKREST_EXECUTABLE = "charmed-postgresql.pgbackrest"
29+
# pgBackRest error codes
30+
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE = (
31+
82 # Archive timeout - unable to archive WAL files within configured timeout period
32+
)
2933

3034
SNAP_COMMON_PATH = "/var/snap/charmed-postgresql/common"
3135
SNAP_CURRENT_PATH = "/var/snap/charmed-postgresql/current"

tests/unit/test_backups.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,14 @@ def test_check_stanza(harness):
860860
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
861861
)
862862

863+
# Test when the failure in the stanza check is due to an archive timeout.
864+
_execute_command.reset_mock()
865+
_s3_initialization_set_failure.reset_mock()
866+
_execute_command.return_value = (82, "", "fake stderr")
867+
with pytest.raises(TimeoutError):
868+
harness.charm.backup.check_stanza()
869+
_s3_initialization_set_failure.assert_not_called()
870+
863871
_execute_command.reset_mock()
864872
_s3_initialization_set_failure.reset_mock()
865873
_execute_command.return_value = (0, "fake stdout", "")

0 commit comments

Comments
 (0)