3636 BACKUP_TYPE_OVERRIDES ,
3737 BACKUP_USER ,
3838 PATRONI_CONF_PATH ,
39+ PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE ,
3940 PGBACKREST_BACKUP_ID_FORMAT ,
4041 PGBACKREST_CONF_PATH ,
4142 PGBACKREST_CONFIGURATION_FILE ,
@@ -210,7 +211,8 @@ def can_use_s3_repository(self) -> tuple[bool, str | None]:
210211
211212 else :
212213 if return_code != 0 :
213- logger .error (f"Failed to run pgbackrest: { stderr } " )
214+ extracted_error = self ._extract_error_message (stdout , stderr )
215+ logger .error (f"Failed to run pgbackrest: { extracted_error } " )
214216 return False , FAILED_TO_INITIALIZE_STANZA_ERROR_MESSAGE
215217
216218 for stanza in json .loads (stdout ):
@@ -369,6 +371,41 @@ def result():
369371 )
370372 return process .returncode , process .stdout .decode (), process .stderr .decode ()
371373
374+ @staticmethod
375+ def _extract_error_message (stdout : str , stderr : str ) -> str :
376+ """Extract key error message from pgBackRest output.
377+
378+ Args:
379+ stdout: Standard output from pgBackRest command.
380+ stderr: Standard error from pgBackRest command.
381+
382+ Returns:
383+ Extracted error message, prioritizing ERROR/WARN lines from output.
384+ """
385+ combined_output = f"{ stdout } \n { stderr } " .strip ()
386+ if not combined_output :
387+ return f"Unknown error occurred. Please check the logs at { PGBACKREST_LOGS_PATH } "
388+
389+ # Extract lines with ERROR or WARN markers from pgBackRest output
390+ error_lines = []
391+ for line in combined_output .splitlines ():
392+ if "ERROR:" in line or "WARN:" in line :
393+ # Clean up the line by removing debug prefixes like "P00 ERROR:"
394+ cleaned = re .sub (r"^.*?(ERROR:|WARN:)" , r"\1" , line ).strip ()
395+ error_lines .append (cleaned )
396+
397+ # If we found error/warning lines, return them joined
398+ if error_lines :
399+ return "; " .join (error_lines )
400+
401+ # Otherwise return the last non-empty line from stderr or stdout
402+ if stderr .strip ():
403+ return stderr .strip ().splitlines ()[- 1 ]
404+ if stdout .strip ():
405+ return stdout .strip ().splitlines ()[- 1 ]
406+
407+ return f"Unknown error occurred. Please check the logs at { PGBACKREST_LOGS_PATH } "
408+
372409 def _format_backup_list (self , backup_list ) -> str :
373410 """Formats provided list of backups as a table."""
374411 s3_parameters , _ = self ._retrieve_s3_parameters ()
@@ -417,7 +454,8 @@ def _generate_backup_list_output(self) -> str:
417454 "--output=json" ,
418455 ])
419456 if return_code != 0 :
420- raise ListBackupsError (f"Failed to list backups with error: { stderr } " )
457+ extracted_error = self ._extract_error_message (output , stderr )
458+ raise ListBackupsError (f"Failed to list backups with error: { extracted_error } " )
421459
422460 backups = json .loads (output )[0 ]["backup" ]
423461 for backup in backups :
@@ -490,7 +528,8 @@ def _list_backups(self, show_failed: bool, parse=True) -> dict[str, tuple[str, s
490528 "--output=json" ,
491529 ])
492530 if return_code != 0 :
493- raise ListBackupsError (f"Failed to list backups with error: { stderr } " )
531+ extracted_error = self ._extract_error_message (output , stderr )
532+ raise ListBackupsError (f"Failed to list backups with error: { extracted_error } " )
494533
495534 repository_info = next (iter (json .loads (output )), None )
496535
@@ -525,7 +564,8 @@ def _list_timelines(self) -> dict[str, tuple[str, str]]:
525564 "--output=json" ,
526565 ])
527566 if return_code != 0 :
528- raise ListBackupsError (f"Failed to list repository with error: { stderr } " )
567+ extracted_error = self ._extract_error_message (output , stderr )
568+ raise ListBackupsError (f"Failed to list repository with error: { extracted_error } " )
529569
530570 repository = json .loads (output ).items ()
531571 if repository is None :
@@ -692,15 +732,27 @@ def check_stanza(self) -> bool:
692732 # for that or else the s3 initialization sequence will fail.
693733 for attempt in Retrying (stop = stop_after_attempt (6 ), wait = wait_fixed (10 ), reraise = True ):
694734 with attempt :
695- return_code , _ , stderr = self ._execute_command ([
735+ return_code , stdout , stderr = self ._execute_command ([
696736 PGBACKREST_EXECUTABLE ,
697737 PGBACKREST_CONFIGURATION_FILE ,
698738 f"--stanza={ self .stanza_name } " ,
699739 "check" ,
700740 ])
741+ if return_code == PGBACKREST_ARCHIVE_TIMEOUT_ERROR_CODE :
742+ # Raise an error if the archive command times out, so the user has the possibility
743+ # to fix network issues and call juju resolve to re-trigger the hook that calls
744+ # this method.
745+ extracted_error = self ._extract_error_message (stdout , stderr )
746+ logger .error (
747+ f"error: { extracted_error } - please fix the error and call juju resolve on this unit"
748+ )
749+ raise TimeoutError
701750 if return_code != 0 :
702751 raise Exception (stderr )
703752 self .charm ._set_primary_status_message ()
753+ except TimeoutError :
754+ # Re-raise to put charm in error state (not blocked), allowing juju resolve
755+ raise
704756 except Exception as e :
705757 # If the check command doesn't succeed, remove the stanza name
706758 # and rollback the configuration.
@@ -747,15 +799,16 @@ def _is_primary_pgbackrest_service_running(self) -> bool:
747799 if not self .charm .primary_endpoint :
748800 logger .warning ("Failed to contact pgBackRest TLS server: no primary endpoint" )
749801 return False
750- return_code , _ , stderr = self ._execute_command ([
802+ return_code , stdout , stderr = self ._execute_command ([
751803 PGBACKREST_EXECUTABLE ,
752804 "server-ping" ,
753805 "--io-timeout=10" ,
754806 self .charm .primary_endpoint ,
755807 ])
756808 if return_code != 0 :
809+ extracted_error = self ._extract_error_message (stdout , stderr )
757810 logger .warning (
758- f"Failed to contact pgBackRest TLS server on { self .charm .primary_endpoint } with error { stderr } "
811+ f"Failed to contact pgBackRest TLS server on { self .charm .primary_endpoint } with error { extracted_error } "
759812 )
760813 return return_code == 0
761814
@@ -970,7 +1023,8 @@ def _run_backup(
9701023 f"backup/{ self .stanza_name } /{ backup_id } /backup.log" ,
9711024 s3_parameters ,
9721025 )
973- error_message = f"Failed to backup PostgreSQL with error: { stderr } "
1026+ extracted_error = self ._extract_error_message (stdout , stderr )
1027+ error_message = f"Failed to backup PostgreSQL with error: { extracted_error } "
9741028 logger .error (f"Backup failed: { error_message } " )
9751029 event .fail (error_message )
9761030 else :
@@ -1125,7 +1179,7 @@ def _on_restore_action(self, event): # noqa: C901
11251179
11261180 # Remove previous cluster information to make it possible to initialise a new cluster.
11271181 logger .info ("Removing previous cluster information" )
1128- return_code , _ , stderr = self ._execute_command (
1182+ return_code , stdout , stderr = self ._execute_command (
11291183 [
11301184 "charmed-postgresql.patronictl" ,
11311185 "-c" ,
@@ -1137,7 +1191,10 @@ def _on_restore_action(self, event): # noqa: C901
11371191 timeout = 10 ,
11381192 )
11391193 if return_code != 0 :
1140- error_message = f"Failed to remove previous cluster information with error: { stderr } "
1194+ extracted_error = self ._extract_error_message (stdout , stderr )
1195+ error_message = (
1196+ f"Failed to remove previous cluster information with error: { extracted_error } "
1197+ )
11411198 logger .error (f"Restore failed: { error_message } " )
11421199 event .fail (error_message )
11431200 return
0 commit comments