diff --git a/src/backups.py b/src/backups.py index ee67a00ab8f..3a83ffb1c75 100644 --- a/src/backups.py +++ b/src/backups.py @@ -34,10 +34,12 @@ BACKUP_TYPE_OVERRIDES, BACKUP_USER, PATRONI_CONF_PATH, + PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE, PGBACKREST_BACKUP_ID_FORMAT, PGBACKREST_CONF_PATH, PGBACKREST_CONFIGURATION_FILE, PGBACKREST_EXECUTABLE, + PGBACKREST_LOG_LEVEL_STDERR, PGBACKREST_LOGROTATE_FILE, PGBACKREST_LOGS_PATH, POSTGRESQL_DATA_PATH, @@ -183,7 +185,13 @@ def can_use_s3_repository(self) -> tuple[bool, str]: try: return_code, stdout, stderr = self._execute_command( - [PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, "info", "--output=json"], + [ + PGBACKREST_EXECUTABLE, + PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, + "info", + "--output=json", + ], timeout=30, ) except TimeoutExpired as e: @@ -195,7 +203,8 @@ def can_use_s3_repository(self) -> tuple[bool, str]: else: if return_code != 0: - logger.error(f"Failed to run pgbackrest: {stderr}") + extracted_error = self._extract_error_message(stderr) + logger.error(f"Failed to run pgbackrest: {extracted_error}") return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE for stanza in json.loads(stdout): @@ -357,6 +366,38 @@ def _execute_command( ) return process.returncode, process.stdout.decode(), process.stderr.decode() + @staticmethod + def _extract_error_message(stderr: str) -> str: + """Extract key error message from pgBackRest stderr output. + + Since we standardize all pgBackRest commands to use --log-level-stderr=warn, + all errors and warnings are consistently written to stderr. This makes error + extraction predictable and avoids potential log duplication issues. + + Args: + stderr: Standard error from pgBackRest command containing errors/warnings. + + Returns: + Extracted error message from stderr, prioritizing ERROR/WARN lines. + """ + if not stderr.strip(): + return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}" + + # Extract lines with ERROR or WARN markers from pgBackRest stderr output + error_lines = [] + for line in stderr.splitlines(): + if "ERROR:" in line or "WARN:" in line: + # Clean up the line by removing debug prefixes like "P00 ERROR:" + cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip() + error_lines.append(cleaned) + + # If we found error/warning lines, return them joined + if error_lines: + return "; ".join(error_lines) + + # Otherwise return the last non-empty line from stderr + return stderr.strip().splitlines()[-1] + def _format_backup_list(self, backup_list) -> str: """Formats provided list of backups as a table.""" s3_parameters, _ = self._retrieve_s3_parameters() @@ -401,11 +442,13 @@ def _generate_backup_list_output(self) -> str: return_code, output, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, "info", "--output=json", ]) if return_code != 0: - raise ListBackupsError(f"Failed to list backups with error: {stderr}") + extracted_error = self._extract_error_message(stderr) + raise ListBackupsError(f"Failed to list backups with error: {extracted_error}") backups = json.loads(output)[0]["backup"] for backup in backups: @@ -472,11 +515,13 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s return_code, output, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, "info", "--output=json", ]) if return_code != 0: - raise ListBackupsError(f"Failed to list backups with error: {stderr}") + extracted_error = self._extract_error_message(stderr) + raise ListBackupsError(f"Failed to list backups with error: {extracted_error}") repository_info = next(iter(json.loads(output)), None) @@ -506,12 +551,14 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]: return_code, output, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, "repo-ls", "--recurse", "--output=json", ]) if return_code != 0: - raise ListBackupsError(f"Failed to list repository with error: {stderr}") + extracted_error = self._extract_error_message(stderr) + raise ListBackupsError(f"Failed to list repository with error: {extracted_error}") repository = json.loads(output).items() if repository is None: @@ -623,6 +670,7 @@ def _initialise_stanza(self, event: HookEvent) -> bool: return_code, _, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, f"--stanza={self.stanza_name}", "stanza-create", ]) @@ -681,12 +729,25 @@ def check_stanza(self) -> bool: return_code, _, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, f"--stanza={self.stanza_name}", "check", ]) + if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE: + # Raise an error if the archive command timeouts, so the user has the possibility + # to fix network issues and call juju resolve to re-trigger the hook that calls + # this method. + extracted_error = self._extract_error_message(stderr) + logger.error( + f"error: {extracted_error} - please fix the error and call juju resolve on this unit" + ) + raise TimeoutError if return_code != 0: raise Exception(stderr) self.charm._set_primary_status_message() + except TimeoutError as e: + # Re-raise to put charm in error state (not blocked), allowing juju resolve + raise e except Exception: # If the check command doesn't succeed, remove the stanza name # and rollback the configuration. @@ -735,13 +796,15 @@ def _is_primary_pgbackrest_service_running(self) -> bool: return False return_code, _, stderr = self._execute_command([ PGBACKREST_EXECUTABLE, + PGBACKREST_LOG_LEVEL_STDERR, "server-ping", "--io-timeout=10", self.charm.primary_endpoint, ]) if return_code != 0: + extracted_error = self._extract_error_message(stderr) logger.warning( - f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}" + f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}" ) return return_code == 0 @@ -931,6 +994,7 @@ def _run_backup( command = [ PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, + PGBACKREST_LOG_LEVEL_STDERR, f"--stanza={self.stanza_name}", "--log-level-console=debug", f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}", @@ -967,7 +1031,8 @@ def _run_backup( f"backup/{self.stanza_name}/{backup_id}/backup.log", s3_parameters, ) - error_message = f"Failed to backup PostgreSQL with error: {stderr}" + extracted_error = self._extract_error_message(stderr) + error_message = f"Failed to backup PostgreSQL with error: {extracted_error}" logger.error(f"Backup failed: {error_message}") event.fail(error_message) else: @@ -1134,7 +1199,10 @@ def _on_restore_action(self, event): # noqa: C901 timeout=10, ) if return_code != 0: - error_message = f"Failed to remove previous cluster information with error: {stderr}" + extracted_error = self._extract_error_message(stderr) + error_message = ( + f"Failed to remove previous cluster information with error: {extracted_error}" + ) logger.error(f"Restore failed: {error_message}") event.fail(error_message) return diff --git a/src/constants.py b/src/constants.py index c6c3203c6c8..43f446a8821 100644 --- a/src/constants.py +++ b/src/constants.py @@ -26,6 +26,16 @@ # Snap constants. PGBACKREST_EXECUTABLE = "charmed-postgresql.pgbackrest" +# pgBackRest logging configuration +# We use stderr for all error/warning output to have a consistent, predictable error extraction +# mechanism. By default, pgBackRest uses stdout (console) for warnings, but we standardize on +# stderr to avoid potential log duplication and to make error handling more reliable. +# Reference: https://pgbackrest.org/configuration.html#section-log +PGBACKREST_LOG_LEVEL_STDERR = "--log-level-stderr=warn" +# pgBackRest error codes +PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE = ( + 82 # Archive timeout - unable to archive WAL files within configured timeout period +) SNAP_COMMON_PATH = "/var/snap/charmed-postgresql/common" SNAP_CURRENT_PATH = "/var/snap/charmed-postgresql/current" diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 5a26476deb2..6c3fc9a91e9 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -13,7 +13,7 @@ from ops.testing import Harness from tenacity import RetryError, wait_fixed -from backups import ListBackupsError +from backups import ListBackupsError, PostgreSQLBackups from charm import PostgresqlOperatorCharm from constants import PEER @@ -37,6 +37,60 @@ def harness(): harness.cleanup() +def test_extract_error_message_with_error_in_stderr(): + """Test extracting error from stderr with ERROR marker.""" + stderr = """2025-11-07 07:21:11.120 P00 ERROR: [056]: unable to find primary cluster - cannot proceed + HINT: are all available clusters in recovery?""" + result = PostgreSQLBackups._extract_error_message(stderr) + assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed" + + +def test_extract_error_message_with_plain_stderr(): + """Test extracting error from stderr when no ERROR marker.""" + stderr = "Connection refused: cannot connect to S3" + result = PostgreSQLBackups._extract_error_message(stderr) + assert result == "Connection refused: cannot connect to S3" + + +def test_extract_error_message_with_warning(): + """Test extracting warning message from stderr.""" + stderr = "P00 WARN: configuration issue detected" + result = PostgreSQLBackups._extract_error_message(stderr) + assert result == "WARN: configuration issue detected" + + +def test_extract_error_message_with_multiple_errors(): + """Test extracting multiple ERROR/WARN lines from stderr.""" + stderr = """P00 ERROR: first error +P00 WARN: warning message +P00 ERROR: second error""" + result = PostgreSQLBackups._extract_error_message(stderr) + assert result == "ERROR: first error; WARN: warning message; ERROR: second error" + + +def test_extract_error_message_with_empty_output(): + """Test with empty stderr returns helpful message.""" + result = PostgreSQLBackups._extract_error_message("") + assert ( + result + == "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest" + ) + + +def test_extract_error_message_fallback_to_stderr_last_line(): + """Test fallback to last line of stderr when no ERROR/WARN markers.""" + stderr = "Line 1\nLine 2\nFinal error message" + result = PostgreSQLBackups._extract_error_message(stderr) + assert result == "Final error message" + + +def test_extract_error_message_cleans_debug_prefix(): + """Test that debug prefixes like 'P00 ERROR:' are cleaned up.""" + stderr = "2025-11-07 07:21:11.120 P00 ERROR: test error message" + result = PostgreSQLBackups._extract_error_message(stderr) + assert result == "ERROR: test error message" + + def test_stanza_name(harness): assert ( harness.charm.backup.stanza_name @@ -694,6 +748,7 @@ def test_initialise_stanza(harness): stanza_creation_command = [ "charmed-postgresql.pgbackrest", "--config=/var/snap/charmed-postgresql/current/etc/pgbackrest/pgbackrest.conf", + "--log-level-stderr=warn", f"--stanza={harness.charm.backup.stanza_name}", "stanza-create", ] @@ -784,6 +839,14 @@ def test_check_stanza(harness): FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE ) + # Test when the failure in the stanza check is due to an archive timeout. + _execute_command.reset_mock() + _s3_initialization_set_failure.reset_mock() + _execute_command.return_value = (82, "", "fake stderr") + with pytest.raises(TimeoutError): + harness.charm.backup.check_stanza() + _s3_initialization_set_failure.assert_not_called() + _execute_command.reset_mock() _s3_initialization_set_failure.reset_mock() _execute_command.return_value = (0, "fake stdout", "")