Skip to content

Commit 8d45417

Browse files
[DPE-6762] Catch archive command timeout (#1328)
* Catch archive command timeout Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Extract warning and error messages from pgBackRest logs Sometimes the error message is sent to the stdout instead of the stderr, so it's important to check both Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Refactor pgBackRest error handling for clarity - Replace magic number 82 with PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE constant - Use semicolon separator for multiple error messages (improved readability) - Document TimeoutError handling to preserve error state for juju resolve - Update corresponding unit tests Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Update src/backups.py Co-authored-by: Carl Csaposs <carl.csaposs@canonical.com> Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@gmail.com> --------- Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@gmail.com> Co-authored-by: Carl Csaposs <carl.csaposs@canonical.com>
1 parent e2181f8 commit 8d45417

File tree

3 files changed

+157
-11
lines changed

3 files changed

+157
-11
lines changed

src/backups.py

Lines changed: 67 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
BACKUP_TYPE_OVERRIDES,
3737
BACKUP_USER,
3838
PATRONI_CONF_PATH,
39+
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE,
3940
PGBACKREST_BACKUP_ID_FORMAT,
4041
PGBACKREST_CONF_PATH,
4142
PGBACKREST_CONFIGURATION_FILE,
@@ -210,7 +211,8 @@ def can_use_s3_repository(self) -> tuple[bool, str | None]:
210211

211212
else:
212213
if return_code != 0:
213-
logger.error(f"Failed to run pgbackrest: {stderr}")
214+
extracted_error = self._extract_error_message(stdout, stderr)
215+
logger.error(f"Failed to run pgbackrest: {extracted_error}")
214216
return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
215217

216218
for stanza in json.loads(stdout):
@@ -369,6 +371,41 @@ def result():
369371
)
370372
return process.returncode, process.stdout.decode(), process.stderr.decode()
371373

374+
@staticmethod
375+
def _extract_error_message(stdout: str, stderr: str) -> str:
376+
"""Extract key error message from pgBackRest output.
377+
378+
Args:
379+
stdout: Standard output from pgBackRest command.
380+
stderr: Standard error from pgBackRest command.
381+
382+
Returns:
383+
Extracted error message, prioritizing ERROR/WARN lines from output.
384+
"""
385+
combined_output = f"{stdout}\n{stderr}".strip()
386+
if not combined_output:
387+
return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"
388+
389+
# Extract lines with ERROR or WARN markers from pgBackRest output
390+
error_lines = []
391+
for line in combined_output.splitlines():
392+
if "ERROR:" in line or "WARN:" in line:
393+
# Clean up the line by removing debug prefixes like "P00 ERROR:"
394+
cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip()
395+
error_lines.append(cleaned)
396+
397+
# If we found error/warning lines, return them joined
398+
if error_lines:
399+
return "; ".join(error_lines)
400+
401+
# Otherwise return the last non-empty line from stderr or stdout
402+
if stderr.strip():
403+
return stderr.strip().splitlines()[-1]
404+
if stdout.strip():
405+
return stdout.strip().splitlines()[-1]
406+
407+
return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"
408+
372409
def _format_backup_list(self, backup_list) -> str:
373410
"""Formats provided list of backups as a table."""
374411
s3_parameters, _ = self._retrieve_s3_parameters()
@@ -417,7 +454,8 @@ def _generate_backup_list_output(self) -> str:
417454
"--output=json",
418455
])
419456
if return_code != 0:
420-
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
457+
extracted_error = self._extract_error_message(output, stderr)
458+
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")
421459

422460
backups = json.loads(output)[0]["backup"]
423461
for backup in backups:
@@ -490,7 +528,8 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s
490528
"--output=json",
491529
])
492530
if return_code != 0:
493-
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
531+
extracted_error = self._extract_error_message(output, stderr)
532+
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")
494533

495534
repository_info = next(iter(json.loads(output)), None)
496535

@@ -525,7 +564,8 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]:
525564
"--output=json",
526565
])
527566
if return_code != 0:
528-
raise ListBackupsError(f"Failed to list repository with error: {stderr}")
567+
extracted_error = self._extract_error_message(output, stderr)
568+
raise ListBackupsError(f"Failed to list repository with error: {extracted_error}")
529569

530570
repository = json.loads(output).items()
531571
if repository is None:
@@ -692,15 +732,27 @@ def check_stanza(self) -> bool:
692732
# for that or else the s3 initialization sequence will fail.
693733
for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True):
694734
with attempt:
695-
return_code, _, stderr = self._execute_command([
735+
return_code, stdout, stderr = self._execute_command([
696736
PGBACKREST_EXECUTABLE,
697737
PGBACKREST_CONFIGURATION_FILE,
698738
f"--stanza={self.stanza_name}",
699739
"check",
700740
])
741+
if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE:
742+
# Raise an error if the archive command timeouts, so the user has the possibility
743+
# to fix network issues and call juju resolve to re-trigger the hook that calls
744+
# this method.
745+
extracted_error = self._extract_error_message(stdout, stderr)
746+
logger.error(
747+
f"error: {extracted_error} - please fix the error and call juju resolve on this unit"
748+
)
749+
raise TimeoutError
701750
if return_code != 0:
702751
raise Exception(stderr)
703752
self.charm._set_primary_status_message()
753+
except TimeoutError:
754+
# Re-raise to put charm in error state (not blocked), allowing juju resolve
755+
raise
704756
except Exception as e:
705757
# If the check command doesn't succeed, remove the stanza name
706758
# and rollback the configuration.
@@ -747,15 +799,16 @@ def _is_primary_pgbackrest_service_running(self) -> bool:
747799
if not self.charm.primary_endpoint:
748800
logger.warning("Failed to contact pgBackRest TLS server: no primary endpoint")
749801
return False
750-
return_code, _, stderr = self._execute_command([
802+
return_code, stdout, stderr = self._execute_command([
751803
PGBACKREST_EXECUTABLE,
752804
"server-ping",
753805
"--io-timeout=10",
754806
self.charm.primary_endpoint,
755807
])
756808
if return_code != 0:
809+
extracted_error = self._extract_error_message(stdout, stderr)
757810
logger.warning(
758-
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}"
811+
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}"
759812
)
760813
return return_code == 0
761814

@@ -970,7 +1023,8 @@ def _run_backup(
9701023
f"backup/{self.stanza_name}/{backup_id}/backup.log",
9711024
s3_parameters,
9721025
)
973-
error_message = f"Failed to backup PostgreSQL with error: {stderr}"
1026+
extracted_error = self._extract_error_message(stdout, stderr)
1027+
error_message = f"Failed to backup PostgreSQL with error: {extracted_error}"
9741028
logger.error(f"Backup failed: {error_message}")
9751029
event.fail(error_message)
9761030
else:
@@ -1125,7 +1179,7 @@ def _on_restore_action(self, event): # noqa: C901
11251179

11261180
# Remove previous cluster information to make it possible to initialise a new cluster.
11271181
logger.info("Removing previous cluster information")
1128-
return_code, _, stderr = self._execute_command(
1182+
return_code, stdout, stderr = self._execute_command(
11291183
[
11301184
"charmed-postgresql.patronictl",
11311185
"-c",
@@ -1137,7 +1191,10 @@ def _on_restore_action(self, event): # noqa: C901
11371191
timeout=10,
11381192
)
11391193
if return_code != 0:
1140-
error_message = f"Failed to remove previous cluster information with error: {stderr}"
1194+
extracted_error = self._extract_error_message(stdout, stderr)
1195+
error_message = (
1196+
f"Failed to remove previous cluster information with error: {extracted_error}"
1197+
)
11411198
logger.error(f"Restore failed: {error_message}")
11421199
event.fail(error_message)
11431200
return

src/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232

3333
# Snap constants.
3434
PGBACKREST_EXECUTABLE = "charmed-postgresql.pgbackrest"
35+
# pgBackRest error codes
36+
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE = (
37+
82 # Archive timeout - unable to archive WAL files within configured timeout period
38+
)
3539
POSTGRESQL_SNAP_NAME = "charmed-postgresql"
3640
SNAP_PACKAGES = [
3741
(

tests/unit/test_backups.py

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from ops.testing import Harness
1515
from tenacity import RetryError, wait_fixed
1616

17-
from backups import ListBackupsError
17+
from backups import ListBackupsError, PostgreSQLBackups
1818
from charm import PostgresqlOperatorCharm
1919
from constants import PEER
2020

@@ -38,6 +38,82 @@ def harness():
3838
harness.cleanup()
3939

4040

41+
def test_extract_error_message_with_error_in_stdout():
42+
"""Test extracting error from stdout with ERROR marker."""
43+
stdout = """2025-11-07 07:21:11.120 P00 ERROR: [056]: unable to find primary cluster - cannot proceed
44+
HINT: are all available clusters in recovery?"""
45+
stderr = ""
46+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
47+
assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed"
48+
49+
50+
def test_extract_error_message_with_error_in_stderr():
51+
"""Test extracting error from stderr when no ERROR marker."""
52+
stdout = ""
53+
stderr = "Connection refused: cannot connect to S3"
54+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
55+
assert result == "Connection refused: cannot connect to S3"
56+
57+
58+
def test_extract_error_message_with_both_stdout_and_stderr():
59+
"""Test extracting error when ERROR marker is in stdout and stderr has additional info."""
60+
stdout = "P00 ERROR: database error occurred"
61+
stderr = "Additional stderr info"
62+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
63+
assert result == "ERROR: database error occurred"
64+
65+
66+
def test_extract_error_message_with_warning():
67+
"""Test extracting warning message from output."""
68+
stdout = "P00 WARN: configuration issue detected"
69+
stderr = ""
70+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
71+
assert result == "WARN: configuration issue detected"
72+
73+
74+
def test_extract_error_message_with_multiple_errors():
75+
"""Test extracting multiple ERROR/WARN lines."""
76+
stdout = """P00 ERROR: first error
77+
P00 WARN: warning message
78+
P00 ERROR: second error"""
79+
stderr = ""
80+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
81+
assert result == "ERROR: first error; WARN: warning message; ERROR: second error"
82+
83+
84+
def test_extract_error_message_with_empty_output():
85+
"""Test with empty stdout and stderr returns helpful message."""
86+
result = PostgreSQLBackups._extract_error_message("", "")
87+
assert (
88+
result
89+
== "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest"
90+
)
91+
92+
93+
def test_extract_error_message_fallback_to_stdout_last_line():
94+
"""Test fallback to last line of stdout when no ERROR/WARN markers."""
95+
stdout = "Some generic output\nMore details here\nFinal error message"
96+
stderr = ""
97+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
98+
assert result == "Final error message"
99+
100+
101+
def test_extract_error_message_fallback_to_stderr_last_line():
102+
"""Test fallback to last line of stderr when no ERROR/WARN markers and no stdout."""
103+
stdout = ""
104+
stderr = "Line 1\nLine 2\nFinal error message"
105+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
106+
assert result == "Final error message"
107+
108+
109+
def test_extract_error_message_cleans_debug_prefix():
110+
"""Test that debug prefixes like 'P00 ERROR:' are cleaned up."""
111+
stdout = "2025-11-07 07:21:11.120 P00 ERROR: test error message"
112+
stderr = ""
113+
result = PostgreSQLBackups._extract_error_message(stdout, stderr)
114+
assert result == "ERROR: test error message"
115+
116+
41117
def test_stanza_name(harness):
42118
assert (
43119
harness.charm.backup.stanza_name
@@ -814,6 +890,15 @@ def test_check_stanza(harness):
814890
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
815891
)
816892

893+
# Test when the failure in the stanza check is due to an archive timeout.
894+
_execute_command.reset_mock()
895+
_s3_initialization_set_failure.reset_mock()
896+
_execute_command.return_value = (82, "", "fake stderr")
897+
with pytest.raises(TimeoutError):
898+
harness.charm.backup.check_stanza()
899+
assert False
900+
_s3_initialization_set_failure.assert_not_called()
901+
817902
_execute_command.reset_mock()
818903
_s3_initialization_set_failure.reset_mock()
819904
_execute_command.return_value = (0, "fake stdout", "")

0 commit comments

Comments
 (0)