Skip to content

Commit 2c0fe44

Browse files
authored
DPE-4699 backup logging and other fixes (#444)
* better messaging for backup and extra timeout for service stop * refinements and bug fixes: * move s3 relation validation to lib * fixed validation call (missing property decorator) * wont rotate logs if mysqld not running * kill client connection prior to stopping service * enable streamed debug log for backup download * libpatch bump * rsync changes from vm * replicated from vm * bump libs * testing split backup test * leverage greater amount of runners * strip output * set offline mode on instance * bump lib
1 parent 36254a9 commit 2c0fe44

File tree

13 files changed

+1582
-649
lines changed

13 files changed

+1582
-649
lines changed

lib/charms/data_platform_libs/v0/upgrade.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def restart(self, event) -> None:
285285

286286
# Increment this PATCH version before using `charmcraft publish-lib` or reset
287287
# to 0 if you are raising the major API version
288-
LIBPATCH = 17
288+
LIBPATCH = 18
289289

290290
PYDEPS = ["pydantic>=1.10,<2", "poetry-core"]
291291

@@ -921,7 +921,7 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
921921
self.charm.unit.status = WaitingStatus("other units upgrading first...")
922922
self.peer_relation.data[self.charm.unit].update({"state": "ready"})
923923

924-
if self.charm.app.planned_units() == 1:
924+
if len(self.app_units) == 1:
925925
# single unit upgrade, emit upgrade_granted event right away
926926
getattr(self.on, "upgrade_granted").emit()
927927

lib/charms/mysql/v0/backups.py

Lines changed: 59 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,14 @@ def is_unit_blocked(self) -> bool:
5555
from charms.mysql.v0.mysql import (
5656
MySQLConfigureInstanceError,
5757
MySQLCreateClusterError,
58+
MySQLCreateClusterSetError,
5859
MySQLDeleteTempBackupDirectoryError,
5960
MySQLDeleteTempRestoreDirectoryError,
6061
MySQLEmptyDataDirectoryError,
6162
MySQLExecuteBackupCommandsError,
6263
MySQLGetMemberStateError,
6364
MySQLInitializeJujuOperationsTableError,
65+
MySQLKillSessionError,
6466
MySQLOfflineModeAndHiddenInstanceExistsError,
6567
MySQLPrepareBackupForRestoreError,
6668
MySQLRescanClusterError,
@@ -80,13 +82,14 @@ def is_unit_blocked(self) -> bool:
8082
from ops.charm import ActionEvent
8183
from ops.framework import Object
8284
from ops.jujuversion import JujuVersion
83-
from ops.model import ActiveStatus, BlockedStatus
85+
from ops.model import BlockedStatus, MaintenanceStatus
8486

8587
from constants import MYSQL_DATA_DIR
8688

8789
logger = logging.getLogger(__name__)
8890

8991
MYSQL_BACKUPS = "mysql-backups"
92+
S3_INTEGRATOR_RELATION_NAME = "s3-parameters"
9093

9194
# The unique Charmhub library identifier, never change it
9295
LIBID = "183844304be247129572309a5fb1e47c"
@@ -96,7 +99,7 @@ def is_unit_blocked(self) -> bool:
9699

97100
# Increment this PATCH version before using `charmcraft publish-lib` or reset
98101
# to 0 if you are raising the major API version
99-
LIBPATCH = 8
102+
LIBPATCH = 10
100103

101104

102105
if typing.TYPE_CHECKING:
@@ -117,6 +120,10 @@ def __init__(self, charm: "MySQLOperatorCharm", s3_integrator: S3Requirer) -> No
117120
self.framework.observe(self.charm.on.restore_action, self._on_restore)
118121

119122
# ------------------ Helpers ------------------
123+
@property
124+
def _s3_integrator_relation_exists(self) -> bool:
125+
"""Returns whether a relation with the s3-integrator exists."""
126+
return bool(self.model.get_relation(S3_INTEGRATOR_RELATION_NAME))
120127

121128
def _retrieve_s3_parameters(self) -> Tuple[Dict[str, str], List[str]]:
122129
"""Retrieve S3 parameters from the S3 integrator relation.
@@ -176,11 +183,7 @@ def _upload_logs_to_s3(
176183
177184
Returns: bool indicating success
178185
"""
179-
logs = f"""Stdout:
180-
{stdout}
181-
182-
Stderr:
183-
{stderr}"""
186+
logs = f"Stdout:\n{stdout}\n\nStderr:\n{stderr}"
184187
logger.debug(f"Logs to upload to S3 at location {log_filename}:\n{logs}")
185188

186189
logger.info(
@@ -206,7 +209,7 @@ def _on_list_backups(self, event: ActionEvent) -> None:
206209
207210
List backups available to restore by this application.
208211
"""
209-
if not self.charm.s3_integrator_relation_exists:
212+
if not self._s3_integrator_relation_exists:
210213
event.fail("Missing relation with S3 integrator charm")
211214
return
212215

@@ -222,7 +225,9 @@ def _on_list_backups(self, event: ActionEvent) -> None:
222225
event.set_results({"backups": self._format_backups_list(backups)})
223226
except Exception as e:
224227
error_message = (
225-
e.message if hasattr(e, "message") else "Failed to retrieve backup ids from S3"
228+
getattr(e, "message")
229+
if hasattr(e, "message")
230+
else "Failed to retrieve backup ids from S3"
226231
)
227232
logger.error(error_message)
228233
event.fail(error_message)
@@ -233,7 +238,7 @@ def _on_create_backup(self, event: ActionEvent) -> None:
233238
"""Handle the create backup action."""
234239
logger.info("A backup has been requested on unit")
235240

236-
if not self.charm.s3_integrator_relation_exists:
241+
if not self._s3_integrator_relation_exists:
237242
logger.error("Backup failed: missing relation with S3 integrator charm")
238243
event.fail("Missing relation with S3 integrator charm")
239244
return
@@ -263,12 +268,13 @@ def _on_create_backup(self, event: ActionEvent) -> None:
263268

264269
# Test uploading metadata to S3 to test credentials before backup
265270
juju_version = JujuVersion.from_environ()
266-
metadata = f"""Date Backup Requested: {datetime_backup_requested}
267-
Model Name: {self.model.name}
268-
Application Name: {self.model.app.name}
269-
Unit Name: {self.charm.unit.name}
270-
Juju Version: {str(juju_version)}
271-
"""
271+
metadata = (
272+
f"Date Backup Requested: {datetime_backup_requested}\n"
273+
f"Model Name: {self.model.name}\n"
274+
f"Application Name: {self.model.app.name}\n"
275+
f"Unit Name: {self.charm.unit.name}\n"
276+
f"Juju Version: {str(juju_version)}\n"
277+
)
272278

273279
if not upload_content_to_s3(metadata, f"{backup_path}.metadata", s3_parameters):
274280
logger.error("Backup failed: Failed to upload metadata to provided S3")
@@ -313,6 +319,7 @@ def _on_create_backup(self, event: ActionEvent) -> None:
313319
"backup-id": datetime_backup_requested,
314320
}
315321
)
322+
self.charm._on_update_status(None)
316323

317324
def _can_unit_perform_backup(self) -> Tuple[bool, Optional[str]]:
318325
"""Validates whether this unit can perform a backup.
@@ -380,8 +387,9 @@ def _backup(self, backup_path: str, s3_parameters: Dict) -> Tuple[bool, Optional
380387
Returns: tuple of (success, error_message)
381388
"""
382389
try:
390+
self.charm.unit.status = MaintenanceStatus("Running backup...")
383391
logger.info("Running the xtrabackup commands")
384-
stdout = self.charm._mysql.execute_backup_commands(
392+
stdout, _ = self.charm._mysql.execute_backup_commands(
385393
backup_path,
386394
s3_parameters,
387395
)
@@ -436,7 +444,7 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
436444
437445
Returns: a boolean indicating whether restore should be run
438446
"""
439-
if not self.charm.s3_integrator_relation_exists:
447+
if not self._s3_integrator_relation_exists:
440448
error_message = "Missing relation with S3 integrator charm"
441449
logger.error(f"Restore failed: {error_message}")
442450
event.fail(error_message)
@@ -481,7 +489,7 @@ def _on_restore(self, event: ActionEvent) -> None:
481489
if not self._pre_restore_checks(event):
482490
return
483491

484-
backup_id = event.params.get("backup-id").strip().strip("/")
492+
backup_id = event.params["backup-id"].strip().strip("/")
485493
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
486494

487495
# Retrieve and validate missing S3 parameters
@@ -500,6 +508,7 @@ def _on_restore(self, event: ActionEvent) -> None:
500508
return
501509

502510
# Run operations to prepare for the restore
511+
self.charm.unit.status = MaintenanceStatus("Running pre-restore operations")
503512
success, error_message = self._pre_restore()
504513
if not success:
505514
logger.error(f"Restore failed: {error_message}")
@@ -520,6 +529,7 @@ def _on_restore(self, event: ActionEvent) -> None:
520529
return
521530

522531
# Run post-restore operations
532+
self.charm.unit.status = MaintenanceStatus("Running post-restore operations")
523533
success, error_message = self._post_restore()
524534
if not success:
525535
logger.error(f"Restore failed: {error_message}")
@@ -533,22 +543,32 @@ def _on_restore(self, event: ActionEvent) -> None:
533543
"completed": "ok",
534544
}
535545
)
546+
# update status as soon as possible
547+
self.charm._on_update_status(None)
536548

537-
def _pre_restore(self) -> Tuple[bool, Optional[str]]:
549+
def _pre_restore(self) -> Tuple[bool, str]:
538550
"""Perform operations that need to be done before performing a restore.
539551
540552
Returns: tuple of (success, error_message)
541553
"""
554+
if not self.charm._mysql.is_mysqld_running():
555+
return True, ""
556+
542557
try:
558+
logger.info("Stopping mysqld before restoring the backup")
559+
self.charm._mysql.kill_client_sessions()
560+
self.charm._mysql.set_instance_offline_mode(True)
543561
self.charm._mysql.stop_mysqld()
562+
except MySQLKillSessionError:
563+
return False, "Failed to kill client sessions"
564+
except MySQLSetInstanceOfflineModeError:
565+
return False, "Failed to set instance as offline before restoring the backup"
544566
except MySQLStopMySQLDError:
545567
return False, "Failed to stop mysqld"
546568

547-
return True, None
569+
return True, ""
548570

549-
def _restore(
550-
self, backup_id: str, s3_parameters: Dict[str, str]
551-
) -> Tuple[bool, bool, Optional[str]]:
571+
def _restore(self, backup_id: str, s3_parameters: Dict[str, str]) -> Tuple[bool, bool, str]:
552572
"""Run the restore operations.
553573
554574
Args:
@@ -558,18 +578,21 @@ def _restore(
558578
Returns: tuple of (success, recoverable_error, error_message)
559579
"""
560580
try:
561-
logger.info("Running xbcloud get commands to retrieve the backup")
581+
logger.info(
582+
"Running xbcloud get commands to retrieve the backup\n"
583+
"This operation can take long time depending on backup size and network speed"
584+
)
585+
self.charm.unit.status = MaintenanceStatus("Downloading backup...")
562586
stdout, stderr, backup_location = self.charm._mysql.retrieve_backup_with_xbcloud(
563587
backup_id,
564588
s3_parameters,
565589
)
566-
logger.debug(f"Stdout of xbcloud get commands: {stdout}")
567-
logger.debug(f"Stderr of xbcloud get commands: {stderr}")
568590
except MySQLRetrieveBackupWithXBCloudError:
569591
return False, True, f"Failed to retrieve backup {backup_id}"
570592

571593
try:
572594
logger.info("Preparing retrieved backup using xtrabackup prepare")
595+
self.charm.unit.status = MaintenanceStatus("Preparing for restore backup...")
573596
stdout, stderr = self.charm._mysql.prepare_backup_for_restore(backup_location)
574597
logger.debug(f"Stdout of xtrabackup prepare command: {stdout}")
575598
logger.debug(f"Stderr of xtrabackup prepare command: {stderr}")
@@ -583,16 +606,17 @@ def _restore(
583606
return False, False, "Failed to empty the data directory"
584607

585608
try:
609+
self.charm.unit.status = MaintenanceStatus("Restoring backup...")
586610
logger.info("Restoring the backup")
587611
stdout, stderr = self.charm._mysql.restore_backup(backup_location)
588612
logger.debug(f"Stdout of xtrabackup move-back command: {stdout}")
589613
logger.debug(f"Stderr of xtrabackup move-back command: {stderr}")
590614
except MySQLRestoreBackupError:
591615
return False, False, f"Failed to restore backup {backup_id}"
592616

593-
return True, True, None
617+
return True, True, ""
594618

595-
def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, Optional[str]]:
619+
def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:
596620
"""Run idempotent operations run after restoring a backup.
597621
598622
Returns tuple of (success, error_message)
@@ -613,9 +637,9 @@ def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, Optional[str]]:
613637
except MySQLStartMySQLDError:
614638
return False, "Failed to start mysqld"
615639

616-
return True, None
640+
return True, ""
617641

618-
def _post_restore(self) -> Tuple[bool, Optional[str]]:
642+
def _post_restore(self) -> Tuple[bool, str]:
619643
"""Run operations required after restoring a backup.
620644
621645
Returns: tuple of (success, error_message)
@@ -638,24 +662,18 @@ def _post_restore(self) -> Tuple[bool, Optional[str]]:
638662
logger.info("Creating cluster on restored node")
639663
unit_label = self.charm.unit.name.replace("/", "-")
640664
self.charm._mysql.create_cluster(unit_label)
665+
self.charm._mysql.create_cluster_set()
641666
self.charm._mysql.initialize_juju_units_operations_table()
642667

643668
self.charm._mysql.rescan_cluster()
644669

645-
logger.info("Retrieving instance cluster state and role")
646-
state, role = self.charm._mysql.get_member_state()
647670
except MySQLCreateClusterError:
648671
return False, "Failed to create InnoDB cluster on restored instance"
672+
except MySQLCreateClusterSetError:
673+
return False, "Failed to create InnoDB cluster-set on restored instance"
649674
except MySQLInitializeJujuOperationsTableError:
650675
return False, "Failed to initialize the juju operations table"
651676
except MySQLRescanClusterError:
652677
return False, "Failed to rescan the cluster"
653-
except MySQLGetMemberStateError:
654-
return False, "Failed to retrieve member state in restored instance"
655-
656-
self.charm.unit_peer_data["member-role"] = role
657-
self.charm.unit_peer_data["member-state"] = state
658678

659-
self.charm.unit.status = ActiveStatus()
660-
661-
return True, None
679+
return True, ""

0 commit comments

Comments
 (0)