Skip to content

Commit dad1b02

Browse files
[DPE-8865] Extract error messages (#1320)
* Extract warning and error messages from pgBackRest logs Sometimes the error message is sent to the stdout instead of the stderr, so it's important to check both Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Use semicolon separator for pgBackRest error messages Concatenate multiple ERROR/WARN lines with "; " instead of " " for improved readability when debugging backup issues. Updated unit test to match new separator format. Cherry-picked from #1328 (commit 1f1b601). Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * [DPE-9070] Handle pgBackRest archive timeout in check_stanza (#1350) Port archive timeout error handling (error code 82) from PR #1328 to allow users to fix network issues and retry with `juju resolve`. When archive operations timeout, the charm enters error state instead of blocked state, enabling recovery via juju resolve. Fixes #1346 Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Standardize pgBackRest logging to use stderr consistently Addresses code review feedback from PR #1320 (review #3577653265) about potential log duplication when checking both stdout and stderr for errors. Changes: - Add PGBACKREST_LOG_LEVEL_STDERR constant (--log-level-stderr=warn) to ensure all pgBackRest commands output errors/warnings to stderr - Update all 9 pgBackRest command invocations to include the new flag - Simplify _extract_error_message() to only check stderr (removes stdout parameter entirely) - Update all 8 call sites and 7 unit tests to use new single-parameter API - Remove redundant test for stdout handling This approach eliminates the risk of log duplication while maintaining predictable error extraction. All errors and warnings now consistently appear in stderr, making error handling more reliable. Reference: https://pgbackrest.org/configuration.html#section-log Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Revert "Standardize pgBackRest logging to use stderr consistently" This reverts commit 13a9ac4. Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Reapply "Standardize pgBackRest logging to use stderr consistently" This reverts commit 1c3eb04. Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> * Fix test_initialise_stanza and cleanup unused variables - Add --log-level-stderr=warn to test_initialise_stanza expected command to match implementation from commit d3a46b6 - Replace unused stdout variables with '_' in check_stanza, _is_primary_pgbackrest_service_running, and _on_restore_action - Add --log-level-console=debug to backup command for detailed output Fixes assertion error in tests/unit/test_backups.py::test_initialise_stanza Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com> --------- Signed-off-by: Marcelo Henrique Neppel <marcelo.neppel@canonical.com>
1 parent 7d6c6c7 commit dad1b02

File tree

3 files changed

+150
-9
lines changed

3 files changed

+150
-9
lines changed

src/backups.py

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@
3434
BACKUP_TYPE_OVERRIDES,
3535
BACKUP_USER,
3636
PATRONI_CONF_PATH,
37+
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE,
3738
PGBACKREST_BACKUP_ID_FORMAT,
3839
PGBACKREST_CONF_PATH,
3940
PGBACKREST_CONFIGURATION_FILE,
4041
PGBACKREST_EXECUTABLE,
42+
PGBACKREST_LOG_LEVEL_STDERR,
4143
PGBACKREST_LOGROTATE_FILE,
4244
PGBACKREST_LOGS_PATH,
4345
POSTGRESQL_DATA_PATH,
@@ -183,7 +185,13 @@ def can_use_s3_repository(self) -> tuple[bool, str]:
183185

184186
try:
185187
return_code, stdout, stderr = self._execute_command(
186-
[PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, "info", "--output=json"],
188+
[
189+
PGBACKREST_EXECUTABLE,
190+
PGBACKREST_CONFIGURATION_FILE,
191+
PGBACKREST_LOG_LEVEL_STDERR,
192+
"info",
193+
"--output=json",
194+
],
187195
timeout=30,
188196
)
189197
except TimeoutExpired as e:
@@ -195,7 +203,8 @@ def can_use_s3_repository(self) -> tuple[bool, str]:
195203

196204
else:
197205
if return_code != 0:
198-
logger.error(f"Failed to run pgbackrest: {stderr}")
206+
extracted_error = self._extract_error_message(stderr)
207+
logger.error(f"Failed to run pgbackrest: {extracted_error}")
199208
return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
200209

201210
for stanza in json.loads(stdout):
@@ -357,6 +366,38 @@ def _execute_command(
357366
)
358367
return process.returncode, process.stdout.decode(), process.stderr.decode()
359368

369+
@staticmethod
370+
def _extract_error_message(stderr: str) -> str:
371+
"""Extract key error message from pgBackRest stderr output.
372+
373+
Since we standardize all pgBackRest commands to use --log-level-stderr=warn,
374+
all errors and warnings are consistently written to stderr. This makes error
375+
extraction predictable and avoids potential log duplication issues.
376+
377+
Args:
378+
stderr: Standard error from pgBackRest command containing errors/warnings.
379+
380+
Returns:
381+
Extracted error message from stderr, prioritizing ERROR/WARN lines.
382+
"""
383+
if not stderr.strip():
384+
return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"
385+
386+
# Extract lines with ERROR or WARN markers from pgBackRest stderr output
387+
error_lines = []
388+
for line in stderr.splitlines():
389+
if "ERROR:" in line or "WARN:" in line:
390+
# Clean up the line by removing debug prefixes like "P00 ERROR:"
391+
cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip()
392+
error_lines.append(cleaned)
393+
394+
# If we found error/warning lines, return them joined
395+
if error_lines:
396+
return "; ".join(error_lines)
397+
398+
# Otherwise return the last non-empty line from stderr
399+
return stderr.strip().splitlines()[-1]
400+
360401
def _format_backup_list(self, backup_list) -> str:
361402
"""Formats provided list of backups as a table."""
362403
s3_parameters, _ = self._retrieve_s3_parameters()
@@ -401,11 +442,13 @@ def _generate_backup_list_output(self) -> str:
401442
return_code, output, stderr = self._execute_command([
402443
PGBACKREST_EXECUTABLE,
403444
PGBACKREST_CONFIGURATION_FILE,
445+
PGBACKREST_LOG_LEVEL_STDERR,
404446
"info",
405447
"--output=json",
406448
])
407449
if return_code != 0:
408-
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
450+
extracted_error = self._extract_error_message(stderr)
451+
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")
409452

410453
backups = json.loads(output)[0]["backup"]
411454
for backup in backups:
@@ -472,11 +515,13 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s
472515
return_code, output, stderr = self._execute_command([
473516
PGBACKREST_EXECUTABLE,
474517
PGBACKREST_CONFIGURATION_FILE,
518+
PGBACKREST_LOG_LEVEL_STDERR,
475519
"info",
476520
"--output=json",
477521
])
478522
if return_code != 0:
479-
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
523+
extracted_error = self._extract_error_message(stderr)
524+
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")
480525

481526
repository_info = next(iter(json.loads(output)), None)
482527

@@ -506,12 +551,14 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]:
506551
return_code, output, stderr = self._execute_command([
507552
PGBACKREST_EXECUTABLE,
508553
PGBACKREST_CONFIGURATION_FILE,
554+
PGBACKREST_LOG_LEVEL_STDERR,
509555
"repo-ls",
510556
"--recurse",
511557
"--output=json",
512558
])
513559
if return_code != 0:
514-
raise ListBackupsError(f"Failed to list repository with error: {stderr}")
560+
extracted_error = self._extract_error_message(stderr)
561+
raise ListBackupsError(f"Failed to list repository with error: {extracted_error}")
515562

516563
repository = json.loads(output).items()
517564
if repository is None:
@@ -623,6 +670,7 @@ def _initialise_stanza(self, event: HookEvent) -> bool:
623670
return_code, _, stderr = self._execute_command([
624671
PGBACKREST_EXECUTABLE,
625672
PGBACKREST_CONFIGURATION_FILE,
673+
PGBACKREST_LOG_LEVEL_STDERR,
626674
f"--stanza={self.stanza_name}",
627675
"stanza-create",
628676
])
@@ -681,12 +729,25 @@ def check_stanza(self) -> bool:
681729
return_code, _, stderr = self._execute_command([
682730
PGBACKREST_EXECUTABLE,
683731
PGBACKREST_CONFIGURATION_FILE,
732+
PGBACKREST_LOG_LEVEL_STDERR,
684733
f"--stanza={self.stanza_name}",
685734
"check",
686735
])
736+
if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE:
737+
# Raise an error if the archive command timeouts, so the user has the possibility
738+
# to fix network issues and call juju resolve to re-trigger the hook that calls
739+
# this method.
740+
extracted_error = self._extract_error_message(stderr)
741+
logger.error(
742+
f"error: {extracted_error} - please fix the error and call juju resolve on this unit"
743+
)
744+
raise TimeoutError
687745
if return_code != 0:
688746
raise Exception(stderr)
689747
self.charm._set_primary_status_message()
748+
except TimeoutError as e:
749+
# Re-raise to put charm in error state (not blocked), allowing juju resolve
750+
raise e
690751
except Exception:
691752
# If the check command doesn't succeed, remove the stanza name
692753
# and rollback the configuration.
@@ -735,13 +796,15 @@ def _is_primary_pgbackrest_service_running(self) -> bool:
735796
return False
736797
return_code, _, stderr = self._execute_command([
737798
PGBACKREST_EXECUTABLE,
799+
PGBACKREST_LOG_LEVEL_STDERR,
738800
"server-ping",
739801
"--io-timeout=10",
740802
self.charm.primary_endpoint,
741803
])
742804
if return_code != 0:
805+
extracted_error = self._extract_error_message(stderr)
743806
logger.warning(
744-
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}"
807+
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}"
745808
)
746809
return return_code == 0
747810

@@ -931,6 +994,7 @@ def _run_backup(
931994
command = [
932995
PGBACKREST_EXECUTABLE,
933996
PGBACKREST_CONFIGURATION_FILE,
997+
PGBACKREST_LOG_LEVEL_STDERR,
934998
f"--stanza={self.stanza_name}",
935999
"--log-level-console=debug",
9361000
f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}",
@@ -967,7 +1031,8 @@ def _run_backup(
9671031
f"backup/{self.stanza_name}/{backup_id}/backup.log",
9681032
s3_parameters,
9691033
)
970-
error_message = f"Failed to backup PostgreSQL with error: {stderr}"
1034+
extracted_error = self._extract_error_message(stderr)
1035+
error_message = f"Failed to backup PostgreSQL with error: {extracted_error}"
9711036
logger.error(f"Backup failed: {error_message}")
9721037
event.fail(error_message)
9731038
else:
@@ -1134,7 +1199,10 @@ def _on_restore_action(self, event): # noqa: C901
11341199
timeout=10,
11351200
)
11361201
if return_code != 0:
1137-
error_message = f"Failed to remove previous cluster information with error: {stderr}"
1202+
extracted_error = self._extract_error_message(stderr)
1203+
error_message = (
1204+
f"Failed to remove previous cluster information with error: {extracted_error}"
1205+
)
11381206
logger.error(f"Restore failed: {error_message}")
11391207
event.fail(error_message)
11401208
return

src/constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,16 @@
2626

2727
# Snap constants.
2828
PGBACKREST_EXECUTABLE = "charmed-postgresql.pgbackrest"
29+
# pgBackRest logging configuration
30+
# We use stderr for all error/warning output to have a consistent, predictable error extraction
31+
# mechanism. By default, pgBackRest uses stdout (console) for warnings, but we standardize on
32+
# stderr to avoid potential log duplication and to make error handling more reliable.
33+
# Reference: https://pgbackrest.org/configuration.html#section-log
34+
PGBACKREST_LOG_LEVEL_STDERR = "--log-level-stderr=warn"
35+
# pgBackRest error codes
36+
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE = (
37+
82 # Archive timeout - unable to archive WAL files within configured timeout period
38+
)
2939

3040
SNAP_COMMON_PATH = "/var/snap/charmed-postgresql/common"
3141
SNAP_CURRENT_PATH = "/var/snap/charmed-postgresql/current"

tests/unit/test_backups.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from ops.testing import Harness
1414
from tenacity import RetryError, wait_fixed
1515

16-
from backups import ListBackupsError
16+
from backups import ListBackupsError, PostgreSQLBackups
1717
from charm import PostgresqlOperatorCharm
1818
from constants import PEER
1919

@@ -37,6 +37,60 @@ def harness():
3737
harness.cleanup()
3838

3939

40+
def test_extract_error_message_with_error_in_stderr():
41+
"""Test extracting error from stderr with ERROR marker."""
42+
stderr = """2025-11-07 07:21:11.120 P00 ERROR: [056]: unable to find primary cluster - cannot proceed
43+
HINT: are all available clusters in recovery?"""
44+
result = PostgreSQLBackups._extract_error_message(stderr)
45+
assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed"
46+
47+
48+
def test_extract_error_message_with_plain_stderr():
49+
"""Test extracting error from stderr when no ERROR marker."""
50+
stderr = "Connection refused: cannot connect to S3"
51+
result = PostgreSQLBackups._extract_error_message(stderr)
52+
assert result == "Connection refused: cannot connect to S3"
53+
54+
55+
def test_extract_error_message_with_warning():
56+
"""Test extracting warning message from stderr."""
57+
stderr = "P00 WARN: configuration issue detected"
58+
result = PostgreSQLBackups._extract_error_message(stderr)
59+
assert result == "WARN: configuration issue detected"
60+
61+
62+
def test_extract_error_message_with_multiple_errors():
63+
"""Test extracting multiple ERROR/WARN lines from stderr."""
64+
stderr = """P00 ERROR: first error
65+
P00 WARN: warning message
66+
P00 ERROR: second error"""
67+
result = PostgreSQLBackups._extract_error_message(stderr)
68+
assert result == "ERROR: first error; WARN: warning message; ERROR: second error"
69+
70+
71+
def test_extract_error_message_with_empty_output():
72+
"""Test with empty stderr returns helpful message."""
73+
result = PostgreSQLBackups._extract_error_message("")
74+
assert (
75+
result
76+
== "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest"
77+
)
78+
79+
80+
def test_extract_error_message_fallback_to_stderr_last_line():
81+
"""Test fallback to last line of stderr when no ERROR/WARN markers."""
82+
stderr = "Line 1\nLine 2\nFinal error message"
83+
result = PostgreSQLBackups._extract_error_message(stderr)
84+
assert result == "Final error message"
85+
86+
87+
def test_extract_error_message_cleans_debug_prefix():
88+
"""Test that debug prefixes like 'P00 ERROR:' are cleaned up."""
89+
stderr = "2025-11-07 07:21:11.120 P00 ERROR: test error message"
90+
result = PostgreSQLBackups._extract_error_message(stderr)
91+
assert result == "ERROR: test error message"
92+
93+
4094
def test_stanza_name(harness):
4195
assert (
4296
harness.charm.backup.stanza_name
@@ -694,6 +748,7 @@ def test_initialise_stanza(harness):
694748
stanza_creation_command = [
695749
"charmed-postgresql.pgbackrest",
696750
"--config=/var/snap/charmed-postgresql/current/etc/pgbackrest/pgbackrest.conf",
751+
"--log-level-stderr=warn",
697752
f"--stanza={harness.charm.backup.stanza_name}",
698753
"stanza-create",
699754
]
@@ -784,6 +839,14 @@ def test_check_stanza(harness):
784839
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
785840
)
786841

842+
# Test when the failure in the stanza check is due to an archive timeout.
843+
_execute_command.reset_mock()
844+
_s3_initialization_set_failure.reset_mock()
845+
_execute_command.return_value = (82, "", "fake stderr")
846+
with pytest.raises(TimeoutError):
847+
harness.charm.backup.check_stanza()
848+
_s3_initialization_set_failure.assert_not_called()
849+
787850
_execute_command.reset_mock()
788851
_s3_initialization_set_failure.reset_mock()
789852
_execute_command.return_value = (0, "fake stdout", "")

0 commit comments

Comments
 (0)