Commit bb5a1d2

[DPE-6488] Address slow mysqld start upon unit reboot (#615)
* Address slow mysqld start upon unit reboot
* Handle race conditions in snap restarts and update status executions + clean up error logging when recovering
* Fix failing unit test
* Run format
* Use simpler query for cluster metadata using mysqlsh
* Fix issue with get_cluster_metadata query when async setup used
* Fix failing unittest
* Address PR feedback
* Remove redundant check when executing manual rejoin
* Bump libpatch of the mysql charm lib
* Increase number of log lines retrieved in assertion to account for error traces
* Address edge case in rollback test where snap refresh was unnecessary
* Use status log to determine successful rollback instead of debug logs
1 parent 024e846 commit bb5a1d2

17 files changed, +252 -110 lines changed

lib/charms/mysql/v0/async_replication.py

Lines changed: 1 addition & 1 deletion

@@ -730,7 +730,7 @@ def _on_consumer_relation_created(self, event):
         """Handle the async_replica relation being created on the leader unit."""
         if not self._charm.unit.is_leader():
             return
-        if not self._charm.unit_initialized and not self.returning_cluster:
+        if not self._charm.unit_initialized() and not self.returning_cluster:
             # avoid running too early for non returning clusters
             self._charm.unit.status = BlockedStatus(
                 "Wait until unit is initialized before running create-replication on offer side"

lib/charms/mysql/v0/mysql.py

Lines changed: 90 additions & 38 deletions

@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0
 
-LIBPATCH = 84
+LIBPATCH = 85
 
 UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
 UNIT_ADD_LOCKNAME = "unit-add"

@@ -418,6 +418,10 @@ class MySQLPluginInstallError(Error):
     """Exception raised when there is an issue installing a MySQL plugin."""
 
 
+class MySQLClusterMetadataExistsError(Error):
+    """Exception raised when there is an issue checking if cluster metadata exists."""
+
+
 @dataclasses.dataclass
 class RouterUser:
     """MySQL Router user."""

@@ -618,8 +622,15 @@ def cluster_initialized(self) -> bool:
             return False
 
         for unit in self.app_units:
-            if self._mysql.cluster_metadata_exists(self.get_unit_address(unit)):
-                return True
+            try:
+                if unit != self.unit and self._mysql.cluster_metadata_exists(
+                    self.get_unit_address(unit)
+                ):
+                    return True
+                elif self._mysql.cluster_metadata_exists():
+                    return True
+            except MySQLClusterMetadataExistsError:
+                pass
 
         return False
 

@@ -660,11 +671,6 @@ def unit_configured(self) -> bool:
             self.get_unit_address(self.unit), self.unit_label
         )
 
-    @property
-    def unit_initialized(self) -> bool:
-        """Check if the unit is added to the cluster."""
-        return self._mysql.cluster_metadata_exists(self.get_unit_address(self.unit))
-
     @property
     def app_peer_data(self) -> Union[ops.RelationDataContent, dict]:
         """Application peer relation data object."""

@@ -743,6 +749,15 @@ def removing_unit(self) -> bool:
         """Check if the unit is being removed."""
         return self.unit_peer_data.get("unit-status") == "removing"
 
+    def unit_initialized(self, raise_exceptions: bool = False) -> bool:
+        """Check if the unit is added to the cluster."""
+        try:
+            return self._mysql.cluster_metadata_exists()
+        except MySQLClusterMetadataExistsError:
+            if raise_exceptions:
+                raise
+            return False
+
     def peer_relation_data(self, scope: Scopes) -> DataPeerData:
         """Returns the peer relation data per scope."""
         if scope == APP_SCOPE:

@@ -1671,35 +1686,62 @@ def is_cluster_in_cluster_set(self, cluster_name: str) -> Optional[bool]:
 
         return cluster_name in cs_status["clusters"]
 
-    def cluster_metadata_exists(self, from_instance: str) -> bool:
-        """Check if this cluster metadata exists on database."""
-        check_cluster_metadata_commands = (
-            "result = session.run_sql(\"SHOW DATABASES LIKE 'mysql_innodb_cluster_metadata'\")",
-            "content = result.fetch_all()",
-            "if content:",
-            (
-                ' result = session.run_sql("SELECT cluster_name FROM mysql_innodb_cluster_metadata'
-                f".clusters where cluster_name = '{self.cluster_name}';\")"
-            ),
-            " print(bool(result.fetch_one()))",
-            "else:",
-            " print(False)",
+    def cluster_metadata_exists(self, from_instance: Optional[str] = None) -> bool:
+        """Check if this cluster metadata exists on database.
+
+        Use mysqlsh when querying clusters from remote instances. However, use
+        mysqlcli when querying locally since this method can be called before
+        the cluster is initialized (before serverconfig and root users are set up
+        correctly)
+        """
+        get_clusters_query = (
+            "SELECT cluster_name "
+            "FROM mysql_innodb_cluster_metadata.clusters "
+            "WHERE EXISTS ("
+            "SELECT * "
+            "FROM information_schema.schemata "
+            "WHERE schema_name = 'mysql_innodb_cluster_metadata'"
+            ")"
         )
 
+        if from_instance:
+            check_cluster_metadata_commands = (
+                f'cursor = session.run_sql("{get_clusters_query}")',
+                "print(cursor.fetch_all())",
+            )
+
+            try:
+                output = self._run_mysqlsh_script(
+                    "\n".join(check_cluster_metadata_commands),
+                    user=self.server_config_user,
+                    password=self.server_config_password,
+                    host=self.instance_def(self.server_config_user, from_instance),
+                    timeout=60,
+                    exception_as_warning=True,
+                )
+            except MySQLClientError:
+                logger.warning(f"Failed to check if cluster metadata exists {from_instance=}")
+                raise MySQLClusterMetadataExistsError(
+                    f"Failed to check if cluster metadata exists {from_instance=}"
+                )
+
+            return self.cluster_name in output
+
         try:
-            output = self._run_mysqlsh_script(
-                "\n".join(check_cluster_metadata_commands),
-                user=self.server_config_user,
-                password=self.server_config_password,
-                host=self.instance_def(self.server_config_user, from_instance),
+            output = self._run_mysqlcli_script(
+                (get_clusters_query,),
+                user=ROOT_USERNAME,
+                password=self.root_password,
                 timeout=60,
                 exception_as_warning=True,
+                log_errors=False,
             )
         except MySQLClientError:
-            logger.warning(f"Failed to check if cluster metadata exists {from_instance=}")
-            return False
+            logger.warning("Failed to check if local cluster metadata exists")
+            raise MySQLClusterMetadataExistsError("Failed to check if cluster metadata exists")
 
-        return output.strip() == "True"
+        cluster_names = [entry[0].strip() for entry in output]
+        return self.cluster_name in cluster_names
 
     def rejoin_cluster(self, cluster_name) -> None:
         """Try to rejoin a cluster to the cluster set."""

@@ -1939,8 +1981,11 @@ def rescan_cluster(
 
     def is_instance_in_cluster(self, unit_label: str) -> bool:
         """Confirm if instance is in the cluster."""
-        if not self.cluster_metadata_exists(self.instance_address):
-            # early return if instance has no cluster metadata
+        try:
+            if not self.cluster_metadata_exists(self.instance_address):
+                # early return if instance has no cluster metadata
+                return False
+        except MySQLClusterMetadataExistsError:
             return False
 
         commands = (

@@ -3244,21 +3289,26 @@ def kill_client_sessions(self) -> None:
             logger.error("Failed to kill external sessions")
             raise MySQLKillSessionError
 
-    def check_mysqlsh_connection(self) -> bool:
-        """Checks if it is possible to connect to the server with mysqlsh."""
-        connect_commands = 'session.run_sql("SELECT 1")'
+    def check_mysqlcli_connection(self) -> bool:
+        """Checks if it is possible to connect to the server with mysqlcli."""
+        connect_commands = ("SELECT 1",)
 
         try:
-            self._run_mysqlsh_script(
+            self._run_mysqlcli_script(
                 connect_commands,
                 user=self.server_config_user,
                 password=self.server_config_password,
-                host=self.instance_def(self.server_config_user),
             )
             return True
         except MySQLClientError:
-            logger.error("Failed to connect to MySQL with mysqlsh")
-            return False
+            logger.warning("Failed to connect to MySQL with mysqlcli with server config user")
+
+        try:
+            self._run_mysqlcli_script(connect_commands)
+            return True
+        except MySQLClientError:
+            logger.error("Failed to connect to MySQL with mysqlcli with default root user")
+            return False
 
     def get_pid_of_port_3306(self) -> Optional[str]:
         """Retrieves the PID of the process that is bound to port 3306."""

@@ -3427,6 +3477,7 @@ def _run_mysqlcli_script(
         password: Optional[str] = None,
         timeout: Optional[int] = None,
         exception_as_warning: bool = False,
+        log_errors: bool = False,
     ) -> list:
         """Execute a MySQL CLI script.
 

@@ -3442,6 +3493,7 @@ def _run_mysqlcli_script(
             password: (optional) password to invoke the mysql cli script with
             timeout: (optional) time before the query should timeout
             exception_as_warning: (optional) whether the exception should be treated as warning
+            log_errors: (optional) whether errors in the output should be logged
         """
         raise NotImplementedError

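Reviewer note (not part of the diff): the net effect of the mysql.py changes is that unit_initialized moves from a property to a method with an explicit failure mode, backed by the new MySQLClusterMetadataExistsError. Below is a minimal, standalone sketch of that call contract; MySQLCharmStub and its hard-coded behaviour are hypothetical stand-ins, not charm code.

# Standalone sketch of the new unit_initialized() contract (illustrative only).
# MySQLClusterMetadataExistsError mirrors the exception added in mysql.py;
# MySQLCharmStub is a hypothetical stand-in, not charm code.


class MySQLClusterMetadataExistsError(Exception):
    """Stand-in for the exception added in lib/charms/mysql/v0/mysql.py."""


class MySQLCharmStub:
    def cluster_metadata_exists(self) -> bool:
        # Simulate a metadata check that cannot complete (e.g. mysqld still starting).
        raise MySQLClusterMetadataExistsError("metadata check failed")

    def unit_initialized(self, raise_exceptions: bool = False) -> bool:
        try:
            return self.cluster_metadata_exists()
        except MySQLClusterMetadataExistsError:
            if raise_exceptions:
                raise
            return False


charm = MySQLCharmStub()

# Hook-style callers (storage detaching, TLS, log flushing) just get False.
print(charm.unit_initialized())  # False

# _can_start-style callers opt in to the exception so a failed check can be
# retried or deferred instead of being read as "not initialized".
try:
    charm.unit_initialized(raise_exceptions=True)
except MySQLClusterMetadataExistsError:
    print("metadata check failed; retry or defer")

Callers that only gate on a boolean keep the default, while _can_start in src/charm.py passes raise_exceptions=True so that a failed check can be retried rather than mistaken for an uninitialized unit.
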
lib/charms/mysql/v0/tls.py

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ def _on_set_tls_private_key(self, event: ActionEvent) -> None:
 
     def _on_tls_relation_joined(self, event) -> None:
         """Request certificate when TLS relation joined."""
-        if not self.charm.unit_initialized:
+        if not self.charm.unit_initialized():
             event.defer()
             return
         self._request_certificate(None)

src/charm.py

Lines changed: 38 additions & 17 deletions

@@ -385,7 +385,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent) -> None:
     def _on_database_storage_detaching(self, _) -> None:
         """Handle the database storage detaching event."""
         # Only executes if the unit was initialised
-        if not self.unit_initialized:
+        if not self.unit_initialized():
             return
 
         # No need to remove the instance from the cluster if it is not a member of the cluster

@@ -422,15 +422,15 @@ def _get_leader_unit() -> Optional[Unit]:
         # Inform other hooks of current status
         self.unit_peer_data["unit-status"] = "removing"
 
-    def _handle_non_online_instance_status(self, state) -> None:
+    def _handle_non_online_instance_status(self, state) -> bool:  # noqa: C901
         """Helper method to handle non-online instance statuses.
 
         Invoked from the update status event handler.
         """
         if state == "recovering":
             # server is in the process of becoming an active member
             logger.info("Instance is being recovered")
-            return
+            return True
 
         if state == "offline":
             # Group Replication is active but the member does not belong to any group

@@ -440,17 +440,28 @@ def _handle_non_online_instance_status(self, state) -> None:
             all_states.add("offline")
 
             if all_states == {"offline"} and self.unit.is_leader():
+                loopback_entry_exists = self.hostname_resolution.update_etc_hosts(None)
+                if loopback_entry_exists and not snap_service_operation(
+                    CHARMED_MYSQL_SNAP_NAME, CHARMED_MYSQLD_SERVICE, "restart"
+                ):
+                    self.unit.status = BlockedStatus(
+                        "Unable to restart mysqld before rebooting from complete outage"
+                    )
+                    return False
+
+                self._mysql.wait_until_mysql_connection()
+
                 # All instance are off or its a single unit cluster
                 # reboot cluster from outage from the leader unit
                 logger.info("Attempting reboot from complete outage.")
                 try:
                     # reboot from outage forcing it when it a single unit
                     self._mysql.reboot_from_complete_outage()
+                    return True
                 except MySQLRebootFromCompleteOutageError:
                     logger.error("Failed to reboot cluster from complete outage.")
                     self.unit.status = BlockedStatus("failed to recover cluster.")
-                finally:
-                    return
+                    return False
 
         if self._mysql.is_cluster_auto_rejoin_ongoing():
             logger.info("Cluster auto-rejoin attempts are still ongoing.")

@@ -466,8 +477,12 @@ def _handle_non_online_instance_status(self, state) -> None:
                     # mysqld access not possible and daemon restart fails
                     # force reset necessary
                     self.unit.status = BlockedStatus("Unable to recover from an unreachable state")
+                    return False
             except SnapServiceOperationError as e:
                 self.unit.status = BlockedStatus(e.message)
+                return False
+
+        return True
 
     def _execute_manual_rejoin(self) -> None:
         """Executes an instance manual rejoin.

@@ -509,7 +524,7 @@ def _on_update_status(self, _) -> None:  # noqa: C901
         if (
             self.unit_peer_data.get("member-state") == "waiting"
             and not self.unit_configured
-            and not self.unit_initialized
+            and not self.unit_initialized()
             and not self.unit.is_leader()
         ):
             # avoid changing status while in initialising

@@ -550,7 +565,8 @@ def _on_update_status(self, _) -> None:  # noqa: C901
            else MaintenanceStatus(state)
        )
 
-        self._handle_non_online_instance_status(state)
+        if not self._handle_non_online_instance_status(state):
+            return
 
        if self.unit.is_leader():
            try:

@@ -768,14 +784,19 @@ def _can_start(self, event: StartEvent) -> bool:
             return True
 
         # Safeguard if receiving on start after unit initialization
-        # with retries to allow time for mysqld startup
-        for _ in range(6):
-            if self.unit_initialized:
-                logger.debug("Delegate status update for start handler on initialized unit.")
-                self._on_update_status(None)
-                return False
-            logger.debug("mysqld not started yet. Retrying check")
-            sleep(5)
+        # with retries to allow for mysqld startup
+        try:
+            for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(5)):
+                with attempt:
+                    if self.unit_initialized(raise_exceptions=True):
+                        logger.debug(
+                            "Delegate status update for start handler on initialized unit."
+                        )
+                        self._on_update_status(None)
+                        return False
+        except RetryError:
+            event.defer()
+            return False
 
         return True
 

@@ -786,7 +807,7 @@ def _is_unit_waiting_to_join_cluster(self) -> bool:
         return (
             self.unit_peer_data.get("member-state") == "waiting"
             and self.unit_configured
-            and not self.unit_initialized
+            and not self.unit_initialized()
             and self.cluster_initialized
         )
 

@@ -897,7 +918,7 @@ def recover_unit_after_restart(self) -> None:
 
     def _restart(self, _: EventBase) -> None:
         """Restart the service."""
-        if not self.unit_initialized:
+        if not self.unit_initialized():
             logger.debug("Restarting standalone mysqld")
             self._mysql.restart_mysqld()
             return

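Reviewer note (not part of the diff): the _can_start change above replaces the hand-rolled sleep loop with tenacity's Retrying context manager. Below is a small, standalone sketch of that retry pattern, assuming a hypothetical check() function standing in for unit_initialized(raise_exceptions=True).

# Standalone sketch of the tenacity retry pattern used by _can_start above.
# check() is a hypothetical stand-in for unit_initialized(raise_exceptions=True).
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

calls = {"count": 0}


def check() -> bool:
    # Fail the first two attempts to simulate mysqld still coming up.
    calls["count"] += 1
    if calls["count"] < 3:
        raise RuntimeError("mysqld not ready yet")
    return True


try:
    # Up to 6 attempts, 5 seconds apart, matching the values in the diff.
    for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(5)):
        with attempt:
            if check():
                print("unit already initialized; delegating to update-status")
except RetryError:
    # Raised only after every attempt has failed; the charm defers the start event here.
    print("still failing after 6 attempts; defer the event")

An attempt that raises is retried with a fixed five-second wait; once all six attempts fail, RetryError propagates, which the charm uses to defer the start event instead of blocking indefinitely.
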
src/flush_mysql_logs.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def _flush_mysql_logs(self, _) -> None:
         """Flush the specified (via LOGS_TYPE env var) mysql logs."""
         if (
             self.charm.peers is None
-            or not self.charm.unit_initialized
+            or not self.charm.unit_initialized()
             or not self.charm.upgrade.idle
             or not self.charm._mysql.is_mysqld_running()
         ):
