Skip to content
Merged
84 changes: 76 additions & 8 deletions src/backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@
BACKUP_TYPE_OVERRIDES,
BACKUP_USER,
PATRONI_CONF_PATH,
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE,
PGBACKREST_BACKUP_ID_FORMAT,
PGBACKREST_CONF_PATH,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_EXECUTABLE,
PGBACKREST_LOG_LEVEL_STDERR,
PGBACKREST_LOGROTATE_FILE,
PGBACKREST_LOGS_PATH,
POSTGRESQL_DATA_PATH,
Expand Down Expand Up @@ -183,7 +185,13 @@ def can_use_s3_repository(self) -> tuple[bool, str]:

try:
return_code, stdout, stderr = self._execute_command(
[PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, "info", "--output=json"],
[
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
"info",
"--output=json",
],
timeout=30,
)
except TimeoutExpired as e:
Expand All @@ -195,7 +203,8 @@ def can_use_s3_repository(self) -> tuple[bool, str]:

else:
if return_code != 0:
logger.error(f"Failed to run pgbackrest: {stderr}")
extracted_error = self._extract_error_message(stderr)
logger.error(f"Failed to run pgbackrest: {extracted_error}")
return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE

for stanza in json.loads(stdout):
Expand Down Expand Up @@ -357,6 +366,38 @@ def _execute_command(
)
return process.returncode, process.stdout.decode(), process.stderr.decode()

@staticmethod
def _extract_error_message(stderr: str) -> str:
"""Extract key error message from pgBackRest stderr output.

Since we standardize all pgBackRest commands to use --log-level-stderr=warn,
all errors and warnings are consistently written to stderr. This makes error
extraction predictable and avoids potential log duplication issues.

Args:
stderr: Standard error from pgBackRest command containing errors/warnings.

Returns:
Extracted error message from stderr, prioritizing ERROR/WARN lines.
"""
if not stderr.strip():
return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"

# Extract lines with ERROR or WARN markers from pgBackRest stderr output
error_lines = []
for line in stderr.splitlines():
if "ERROR:" in line or "WARN:" in line:
# Clean up the line by removing debug prefixes like "P00 ERROR:"
cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip()
error_lines.append(cleaned)

# If we found error/warning lines, return them joined
if error_lines:
return "; ".join(error_lines)

# Otherwise return the last non-empty line from stderr
return stderr.strip().splitlines()[-1]

def _format_backup_list(self, backup_list) -> str:
"""Formats provided list of backups as a table."""
s3_parameters, _ = self._retrieve_s3_parameters()
Expand Down Expand Up @@ -401,11 +442,13 @@ def _generate_backup_list_output(self) -> str:
return_code, output, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
"info",
"--output=json",
])
if return_code != 0:
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
extracted_error = self._extract_error_message(stderr)
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")

backups = json.loads(output)[0]["backup"]
for backup in backups:
Expand Down Expand Up @@ -472,11 +515,13 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s
return_code, output, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
"info",
"--output=json",
])
if return_code != 0:
raise ListBackupsError(f"Failed to list backups with error: {stderr}")
extracted_error = self._extract_error_message(stderr)
raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")

repository_info = next(iter(json.loads(output)), None)

Expand Down Expand Up @@ -506,12 +551,14 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]:
return_code, output, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
"repo-ls",
"--recurse",
"--output=json",
])
if return_code != 0:
raise ListBackupsError(f"Failed to list repository with error: {stderr}")
extracted_error = self._extract_error_message(stderr)
raise ListBackupsError(f"Failed to list repository with error: {extracted_error}")

repository = json.loads(output).items()
if repository is None:
Expand Down Expand Up @@ -623,6 +670,7 @@ def _initialise_stanza(self, event: HookEvent) -> bool:
return_code, _, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
f"--stanza={self.stanza_name}",
"stanza-create",
])
Expand Down Expand Up @@ -681,12 +729,25 @@ def check_stanza(self) -> bool:
return_code, _, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
f"--stanza={self.stanza_name}",
"check",
])
if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE:
# Raise an error if the archive command timeouts, so the user has the possibility
# to fix network issues and call juju resolve to re-trigger the hook that calls
# this method.
extracted_error = self._extract_error_message(stderr)
logger.error(
f"error: {extracted_error} - please fix the error and call juju resolve on this unit"
)
raise TimeoutError
if return_code != 0:
raise Exception(stderr)
self.charm._set_primary_status_message()
except TimeoutError as e:
# Re-raise to put charm in error state (not blocked), allowing juju resolve
raise e
except Exception:
# If the check command doesn't succeed, remove the stanza name
# and rollback the configuration.
Expand Down Expand Up @@ -735,13 +796,15 @@ def _is_primary_pgbackrest_service_running(self) -> bool:
return False
return_code, _, stderr = self._execute_command([
PGBACKREST_EXECUTABLE,
PGBACKREST_LOG_LEVEL_STDERR,
"server-ping",
"--io-timeout=10",
self.charm.primary_endpoint,
])
if return_code != 0:
extracted_error = self._extract_error_message(stderr)
logger.warning(
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}"
f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}"
)
return return_code == 0

Expand Down Expand Up @@ -931,6 +994,7 @@ def _run_backup(
command = [
PGBACKREST_EXECUTABLE,
PGBACKREST_CONFIGURATION_FILE,
PGBACKREST_LOG_LEVEL_STDERR,
f"--stanza={self.stanza_name}",
"--log-level-console=debug",
f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}",
Expand Down Expand Up @@ -967,7 +1031,8 @@ def _run_backup(
f"backup/{self.stanza_name}/{backup_id}/backup.log",
s3_parameters,
)
error_message = f"Failed to backup PostgreSQL with error: {stderr}"
extracted_error = self._extract_error_message(stderr)
error_message = f"Failed to backup PostgreSQL with error: {extracted_error}"
logger.error(f"Backup failed: {error_message}")
event.fail(error_message)
else:
Expand Down Expand Up @@ -1134,7 +1199,10 @@ def _on_restore_action(self, event): # noqa: C901
timeout=10,
)
if return_code != 0:
error_message = f"Failed to remove previous cluster information with error: {stderr}"
extracted_error = self._extract_error_message(stderr)
error_message = (
f"Failed to remove previous cluster information with error: {extracted_error}"
)
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return
Expand Down
10 changes: 10 additions & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@

# Snap constants.
PGBACKREST_EXECUTABLE = "charmed-postgresql.pgbackrest"
# pgBackRest logging configuration
# We use stderr for all error/warning output to have a consistent, predictable error extraction
# mechanism. By default, pgBackRest uses stdout (console) for warnings, but we standardize on
# stderr to avoid potential log duplication and to make error handling more reliable.
# Reference: https://pgbackrest.org/configuration.html#section-log
PGBACKREST_LOG_LEVEL_STDERR = "--log-level-stderr=warn"
# pgBackRest error codes
PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE = (
82 # Archive timeout - unable to archive WAL files within configured timeout period
)

SNAP_COMMON_PATH = "/var/snap/charmed-postgresql/common"
SNAP_CURRENT_PATH = "/var/snap/charmed-postgresql/current"
Expand Down
65 changes: 64 additions & 1 deletion tests/unit/test_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ops.testing import Harness
from tenacity import RetryError, wait_fixed

from backups import ListBackupsError
from backups import ListBackupsError, PostgreSQLBackups
from charm import PostgresqlOperatorCharm
from constants import PEER

Expand All @@ -37,6 +37,60 @@ def harness():
harness.cleanup()


def test_extract_error_message_with_error_in_stderr():
"""Test extracting error from stderr with ERROR marker."""
stderr = """2025-11-07 07:21:11.120 P00 ERROR: [056]: unable to find primary cluster - cannot proceed
HINT: are all available clusters in recovery?"""
result = PostgreSQLBackups._extract_error_message(stderr)
assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed"


def test_extract_error_message_with_plain_stderr():
"""Test extracting error from stderr when no ERROR marker."""
stderr = "Connection refused: cannot connect to S3"
result = PostgreSQLBackups._extract_error_message(stderr)
assert result == "Connection refused: cannot connect to S3"


def test_extract_error_message_with_warning():
"""Test extracting warning message from stderr."""
stderr = "P00 WARN: configuration issue detected"
result = PostgreSQLBackups._extract_error_message(stderr)
assert result == "WARN: configuration issue detected"


def test_extract_error_message_with_multiple_errors():
"""Test extracting multiple ERROR/WARN lines from stderr."""
stderr = """P00 ERROR: first error
P00 WARN: warning message
P00 ERROR: second error"""
result = PostgreSQLBackups._extract_error_message(stderr)
assert result == "ERROR: first error; WARN: warning message; ERROR: second error"


def test_extract_error_message_with_empty_output():
"""Test with empty stderr returns helpful message."""
result = PostgreSQLBackups._extract_error_message("")
assert (
result
== "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest"
)


def test_extract_error_message_fallback_to_stderr_last_line():
"""Test fallback to last line of stderr when no ERROR/WARN markers."""
stderr = "Line 1\nLine 2\nFinal error message"
result = PostgreSQLBackups._extract_error_message(stderr)
assert result == "Final error message"


def test_extract_error_message_cleans_debug_prefix():
"""Test that debug prefixes like 'P00 ERROR:' are cleaned up."""
stderr = "2025-11-07 07:21:11.120 P00 ERROR: test error message"
result = PostgreSQLBackups._extract_error_message(stderr)
assert result == "ERROR: test error message"


def test_stanza_name(harness):
assert (
harness.charm.backup.stanza_name
Expand Down Expand Up @@ -694,6 +748,7 @@ def test_initialise_stanza(harness):
stanza_creation_command = [
"charmed-postgresql.pgbackrest",
"--config=/var/snap/charmed-postgresql/current/etc/pgbackrest/pgbackrest.conf",
"--log-level-stderr=warn",
f"--stanza={harness.charm.backup.stanza_name}",
"stanza-create",
]
Expand Down Expand Up @@ -784,6 +839,14 @@ def test_check_stanza(harness):
FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
)

# Test when the failure in the stanza check is due to an archive timeout.
_execute_command.reset_mock()
_s3_initialization_set_failure.reset_mock()
_execute_command.return_value = (82, "", "fake stderr")
with pytest.raises(TimeoutError):
harness.charm.backup.check_stanza()
_s3_initialization_set_failure.assert_not_called()

_execute_command.reset_mock()
_s3_initialization_set_failure.reset_mock()
_execute_command.return_value = (0, "fake stdout", "")
Expand Down