canonical · marceloneppel · Dec 16, 2025 · Nov 28, 2025 · Dec 10, 2025 · Dec 12, 2025
@@ -34,10 +34,12 @@
     BACKUP_TYPE_OVERRIDES,
     BACKUP_USER,
     PATRONI_CONF_PATH,
+    PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE,
     PGBACKREST_BACKUP_ID_FORMAT,
     PGBACKREST_CONF_PATH,
     PGBACKREST_CONFIGURATION_FILE,
     PGBACKREST_EXECUTABLE,
+    PGBACKREST_LOG_LEVEL_STDERR,
     PGBACKREST_LOGROTATE_FILE,
     PGBACKREST_LOGS_PATH,
     POSTGRESQL_DATA_PATH,
@@ -183,7 +185,13 @@ def can_use_s3_repository(self) -> tuple[bool, str]:
 
         try:
             return_code, stdout, stderr = self._execute_command(
-                [PGBACKREST_EXECUTABLE, PGBACKREST_CONFIGURATION_FILE, "info", "--output=json"],
+                [
+                    PGBACKREST_EXECUTABLE,
+                    PGBACKREST_CONFIGURATION_FILE,
+                    PGBACKREST_LOG_LEVEL_STDERR,
+                    "info",
+                    "--output=json",
+                ],
                 timeout=30,
             )
         except TimeoutExpired as e:
@@ -195,7 +203,8 @@ def can_use_s3_repository(self) -> tuple[bool, str]:
 
         else:
             if return_code != 0:
-                logger.error(f"Failed to run pgbackrest: {stderr}")
+                extracted_error = self._extract_error_message(stderr)
+                logger.error(f"Failed to run pgbackrest: {extracted_error}")
                 return False, FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
 
         for stanza in json.loads(stdout):
@@ -357,6 +366,38 @@ def _execute_command(
         )
         return process.returncode, process.stdout.decode(), process.stderr.decode()
 
+    @staticmethod
+    def _extract_error_message(stderr: str) -> str:
+        """Extract key error message from pgBackRest stderr output.
+
+        Since we standardize all pgBackRest commands to use --log-level-stderr=warn,
+        all errors and warnings are consistently written to stderr. This makes error
+        extraction predictable and avoids potential log duplication issues.
+
+        Args:
+            stderr: Standard error from pgBackRest command containing errors/warnings.
+
+        Returns:
+            Extracted error message from stderr, prioritizing ERROR/WARN lines.
+        """
+        if not stderr.strip():
+            return f"Unknown error occurred. Please check the logs at {PGBACKREST_LOGS_PATH}"
+
+        # Extract lines with ERROR or WARN markers from pgBackRest stderr output
+        error_lines = []
+        for line in stderr.splitlines():
+            if "ERROR:" in line or "WARN:" in line:
+                # Clean up the line by removing debug prefixes like "P00  ERROR:"
+                cleaned = re.sub(r"^.*?(ERROR:|WARN:)", r"\1", line).strip()
+                error_lines.append(cleaned)
+
+        # If we found error/warning lines, return them joined
+        if error_lines:
+            return "; ".join(error_lines)
+
+        # Otherwise return the last non-empty line from stderr
+        return stderr.strip().splitlines()[-1]
+
     def _format_backup_list(self, backup_list) -> str:
         """Formats provided list of backups as a table."""
         s3_parameters, _ = self._retrieve_s3_parameters()
@@ -401,11 +442,13 @@ def _generate_backup_list_output(self) -> str:
         return_code, output, stderr = self._execute_command([
             PGBACKREST_EXECUTABLE,
             PGBACKREST_CONFIGURATION_FILE,
+            PGBACKREST_LOG_LEVEL_STDERR,
             "info",
             "--output=json",
         ])
         if return_code != 0:
-            raise ListBackupsError(f"Failed to list backups with error: {stderr}")
+            extracted_error = self._extract_error_message(stderr)
+            raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")
 
         backups = json.loads(output)[0]["backup"]
         for backup in backups:
@@ -472,11 +515,13 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s
         return_code, output, stderr = self._execute_command([
             PGBACKREST_EXECUTABLE,
             PGBACKREST_CONFIGURATION_FILE,
+            PGBACKREST_LOG_LEVEL_STDERR,
             "info",
             "--output=json",
         ])
         if return_code != 0:
-            raise ListBackupsError(f"Failed to list backups with error: {stderr}")
+            extracted_error = self._extract_error_message(stderr)
+            raise ListBackupsError(f"Failed to list backups with error: {extracted_error}")
 
         repository_info = next(iter(json.loads(output)), None)
 
@@ -506,12 +551,14 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]:
         return_code, output, stderr = self._execute_command([
             PGBACKREST_EXECUTABLE,
             PGBACKREST_CONFIGURATION_FILE,
+            PGBACKREST_LOG_LEVEL_STDERR,
             "repo-ls",
             "--recurse",
             "--output=json",
         ])
         if return_code != 0:
-            raise ListBackupsError(f"Failed to list repository with error: {stderr}")
+            extracted_error = self._extract_error_message(stderr)
+            raise ListBackupsError(f"Failed to list repository with error: {extracted_error}")
 
         repository = json.loads(output).items()
         if repository is None:
@@ -623,6 +670,7 @@ def _initialise_stanza(self, event: HookEvent) -> bool:
                     return_code, _, stderr = self._execute_command([
                         PGBACKREST_EXECUTABLE,
                         PGBACKREST_CONFIGURATION_FILE,
+                        PGBACKREST_LOG_LEVEL_STDERR,
                         f"--stanza={self.stanza_name}",
                         "stanza-create",
                     ])
@@ -681,12 +729,25 @@ def check_stanza(self) -> bool:
                     return_code, _, stderr = self._execute_command([
                         PGBACKREST_EXECUTABLE,
                         PGBACKREST_CONFIGURATION_FILE,
+                        PGBACKREST_LOG_LEVEL_STDERR,
                         f"--stanza={self.stanza_name}",
                         "check",
                     ])
+                    if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE:
+                        # Raise an error if the archive command timeouts, so the user has the possibility
+                        # to fix network issues and call juju resolve to re-trigger the hook that calls
+                        # this method.
+                        extracted_error = self._extract_error_message(stderr)
+                        logger.error(
+                            f"error: {extracted_error} - please fix the error and call juju resolve on this unit"
+                        )
+                        raise TimeoutError
                     if return_code != 0:
                         raise Exception(stderr)
             self.charm._set_primary_status_message()
+        except TimeoutError as e:
+            # Re-raise to put charm in error state (not blocked), allowing juju resolve
+            raise e
         except Exception:
             # If the check command doesn't succeed, remove the stanza name
             # and rollback the configuration.
@@ -735,13 +796,15 @@ def _is_primary_pgbackrest_service_running(self) -> bool:
             return False
         return_code, _, stderr = self._execute_command([
             PGBACKREST_EXECUTABLE,
+            PGBACKREST_LOG_LEVEL_STDERR,
             "server-ping",
             "--io-timeout=10",
             self.charm.primary_endpoint,
         ])
         if return_code != 0:
+            extracted_error = self._extract_error_message(stderr)
             logger.warning(
-                f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {stderr}"
+                f"Failed to contact pgBackRest TLS server on {self.charm.primary_endpoint} with error {extracted_error}"
             )
         return return_code == 0
 
@@ -931,6 +994,7 @@ def _run_backup(
         command = [
             PGBACKREST_EXECUTABLE,
             PGBACKREST_CONFIGURATION_FILE,
+            PGBACKREST_LOG_LEVEL_STDERR,
             f"--stanza={self.stanza_name}",
             "--log-level-console=debug",
             f"--type={BACKUP_TYPE_OVERRIDES[backup_type]}",
@@ -967,7 +1031,8 @@ def _run_backup(
                 f"backup/{self.stanza_name}/{backup_id}/backup.log",
                 s3_parameters,
             )
-            error_message = f"Failed to backup PostgreSQL with error: {stderr}"
+            extracted_error = self._extract_error_message(stderr)
+            error_message = f"Failed to backup PostgreSQL with error: {extracted_error}"
             logger.error(f"Backup failed: {error_message}")
             event.fail(error_message)
         else:
@@ -1134,7 +1199,10 @@ def _on_restore_action(self, event):  # noqa: C901
             timeout=10,
         )
         if return_code != 0:
-            error_message = f"Failed to remove previous cluster information with error: {stderr}"
+            extracted_error = self._extract_error_message(stderr)
+            error_message = (
+                f"Failed to remove previous cluster information with error: {extracted_error}"
+            )
             logger.error(f"Restore failed: {error_message}")
             event.fail(error_message)
             return

@@ -26,6 +26,16 @@
 
 # Snap constants.
 PGBACKREST_EXECUTABLE = "charmed-postgresql.pgbackrest"
+# pgBackRest logging configuration
+# We use stderr for all error/warning output to have a consistent, predictable error extraction
+# mechanism. By default, pgBackRest uses stdout (console) for warnings, but we standardize on
+# stderr to avoid potential log duplication and to make error handling more reliable.
+# Reference: https://pgbackrest.org/configuration.html#section-log
+PGBACKREST_LOG_LEVEL_STDERR = "--log-level-stderr=warn"
+# pgBackRest error codes
+PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE = (
+    82  # Archive timeout - unable to archive WAL files within configured timeout period
+)
 
 SNAP_COMMON_PATH = "/var/snap/charmed-postgresql/common"
 SNAP_CURRENT_PATH = "/var/snap/charmed-postgresql/current"

@@ -13,7 +13,7 @@
 from ops.testing import Harness
 from tenacity import RetryError, wait_fixed
 
-from backups import ListBackupsError
+from backups import ListBackupsError, PostgreSQLBackups
 from charm import PostgresqlOperatorCharm
 from constants import PEER
 
@@ -37,6 +37,60 @@ def harness():
     harness.cleanup()
 
 
+def test_extract_error_message_with_error_in_stderr():
+    """Test extracting error from stderr with ERROR marker."""
+    stderr = """2025-11-07 07:21:11.120 P00  ERROR: [056]: unable to find primary cluster - cannot proceed
+                                    HINT: are all available clusters in recovery?"""
+    result = PostgreSQLBackups._extract_error_message(stderr)
+    assert result == "ERROR: [056]: unable to find primary cluster - cannot proceed"
+
+
+def test_extract_error_message_with_plain_stderr():
+    """Test extracting error from stderr when no ERROR marker."""
+    stderr = "Connection refused: cannot connect to S3"
+    result = PostgreSQLBackups._extract_error_message(stderr)
+    assert result == "Connection refused: cannot connect to S3"
+
+
+def test_extract_error_message_with_warning():
+    """Test extracting warning message from stderr."""
+    stderr = "P00  WARN: configuration issue detected"
+    result = PostgreSQLBackups._extract_error_message(stderr)
+    assert result == "WARN: configuration issue detected"
+
+
+def test_extract_error_message_with_multiple_errors():
+    """Test extracting multiple ERROR/WARN lines from stderr."""
+    stderr = """P00 ERROR: first error
+P00 WARN: warning message
+P00 ERROR: second error"""
+    result = PostgreSQLBackups._extract_error_message(stderr)
+    assert result == "ERROR: first error; WARN: warning message; ERROR: second error"
+
+
+def test_extract_error_message_with_empty_output():
+    """Test with empty stderr returns helpful message."""
+    result = PostgreSQLBackups._extract_error_message("")
+    assert (
+        result
+        == "Unknown error occurred. Please check the logs at /var/snap/charmed-postgresql/common/var/log/pgbackrest"
+    )
+
+
+def test_extract_error_message_fallback_to_stderr_last_line():
+    """Test fallback to last line of stderr when no ERROR/WARN markers."""
+    stderr = "Line 1\nLine 2\nFinal error message"
+    result = PostgreSQLBackups._extract_error_message(stderr)
+    assert result == "Final error message"
+
+
+def test_extract_error_message_cleans_debug_prefix():
+    """Test that debug prefixes like 'P00  ERROR:' are cleaned up."""
+    stderr = "2025-11-07 07:21:11.120 P00  ERROR: test error message"
+    result = PostgreSQLBackups._extract_error_message(stderr)
+    assert result == "ERROR: test error message"
+
+
 def test_stanza_name(harness):
     assert (
         harness.charm.backup.stanza_name
@@ -694,6 +748,7 @@ def test_initialise_stanza(harness):
         stanza_creation_command = [
             "charmed-postgresql.pgbackrest",
             "--config=/var/snap/charmed-postgresql/current/etc/pgbackrest/pgbackrest.conf",
+            "--log-level-stderr=warn",
             f"--stanza={harness.charm.backup.stanza_name}",
             "stanza-create",
         ]
@@ -784,6 +839,14 @@ def test_check_stanza(harness):
             FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
         )
 
+        # Test when the failure in the stanza check is due to an archive timeout.
+        _execute_command.reset_mock()
+        _s3_initialization_set_failure.reset_mock()
+        _execute_command.return_value = (82, "", "fake stderr")
+        with pytest.raises(TimeoutError):
+            harness.charm.backup.check_stanza()
+        _s3_initialization_set_failure.assert_not_called()
+
         _execute_command.reset_mock()
         _s3_initialization_set_failure.reset_mock()
         _execute_command.return_value = (0, "fake stdout", "")