
Commit 2009919

Recover from pod restarts during cluster creation during setup (#499)

* Recover from pod restarts during cluster creation during setup
* Commit missed file after format
* Re-add unnecessarily removed code
* Only run create_cluster in update_status if all other units are waiting and current unit is offline
* Address PR feedback
* Fix failing scale down and then scale up integration test
* Add comments to explain measures to recover from pod reschedule during cluster creation
* Pull latest charm libs for mysql and tracing
* Pull in latest mysql backups charm lib

1 parent dfdd0bd · commit 2009919
8 files changed (+224, −33 lines)

lib/charms/mysql/v0/backups.py (4 additions, 3 deletions)

@@ -60,9 +60,9 @@ def is_unit_blocked(self) -> bool:
     MySQLDeleteTempRestoreDirectoryError,
     MySQLEmptyDataDirectoryError,
     MySQLExecuteBackupCommandsError,
-    MySQLGetMemberStateError,
     MySQLInitializeJujuOperationsTableError,
     MySQLKillSessionError,
+    MySQLNoMemberStateError,
     MySQLOfflineModeAndHiddenInstanceExistsError,
     MySQLPrepareBackupForRestoreError,
     MySQLRescanClusterError,
@@ -73,6 +73,7 @@ def is_unit_blocked(self) -> bool:
     MySQLSetInstanceOptionError,
     MySQLStartMySQLDError,
     MySQLStopMySQLDError,
+    MySQLUnableToGetMemberStateError,
 )
 from charms.mysql.v0.s3_helpers import (
     fetch_and_check_existence_of_s3_path,
@@ -99,7 +100,7 @@ def is_unit_blocked(self) -> bool:

 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 11
+LIBPATCH = 12


 if typing.TYPE_CHECKING:
@@ -339,7 +340,7 @@ def _can_unit_perform_backup(self) -> Tuple[bool, Optional[str]]:

         try:
             state, role = self.charm._mysql.get_member_state()
-        except MySQLGetMemberStateError:
+        except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
             return False, "Error obtaining member state"

         if role == "primary" and self.charm.app.planned_units() > 1:
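
Note: the backups library now distinguishes two failure modes when reading member state. A minimal sketch of how a caller might treat them differently (the `describe_member` helper and its `mysql` argument are illustrative, not part of this commit):

# Hypothetical caller sketch: handling the split exceptions separately.
from charms.mysql.v0.mysql import (
    MySQLNoMemberStateError,
    MySQLUnableToGetMemberStateError,
)


def describe_member(mysql) -> str:
    """Return a coarse description of a unit's Group Replication state."""
    try:
        state, role = mysql.get_member_state()
    except MySQLUnableToGetMemberStateError:
        # mysqld could not be queried at all (e.g. the daemon is down).
        return "unreachable"
    except MySQLNoMemberStateError:
        # mysqld answered, but reported no member state row - e.g. after a
        # pod reschedule interrupted cluster creation.
        return "uninitialized"
    return f"{state}/{role}"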

lib/charms/mysql/v0/mysql.py (43 additions, 7 deletions)

@@ -134,7 +134,7 @@ def wait_until_mysql_connection(self) -> None:
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0

-LIBPATCH = 72
+LIBPATCH = 73

 UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
 UNIT_ADD_LOCKNAME = "unit-add"
@@ -276,8 +276,12 @@ class MySQLGrantPrivilegesToUserError(Error):
     """Exception raised when there is an issue granting privileges to user."""


-class MySQLGetMemberStateError(Error):
-    """Exception raised when there is an issue getting member state."""
+class MySQLNoMemberStateError(Error):
+    """Exception raised when there is no member state."""
+
+
+class MySQLUnableToGetMemberStateError(Error):
+    """Exception raised when unable to get member state."""


 class MySQLGetClusterEndpointsError(Error):
@@ -620,6 +624,26 @@ def cluster_initialized(self) -> bool:

         return False

+    @property
+    def only_one_cluster_node_thats_uninitialized(self) -> Optional[bool]:
+        """Check if only a single cluster node exists across all units."""
+        if not self.app_peer_data.get("cluster-name"):
+            return None
+
+        total_cluster_nodes = 0
+        for unit in self.app_units:
+            total_cluster_nodes += self._mysql.get_cluster_node_count(
+                from_instance=self.get_unit_address(unit)
+            )
+
+        total_online_cluster_nodes = 0
+        for unit in self.app_units:
+            total_online_cluster_nodes += self._mysql.get_cluster_node_count(
+                from_instance=self.get_unit_address(unit), node_status=MySQLMemberState["ONLINE"]
+            )
+
+        return total_cluster_nodes == 1 and total_online_cluster_nodes == 0
+
     @property
     def cluster_fully_initialized(self) -> bool:
         """Returns True if the cluster is fully initialized.
@@ -1728,6 +1752,18 @@ def is_instance_configured_for_innodb(
         )
         return False

+    def drop_group_replication_metadata_schema(self) -> None:
+        """Drop the group replication metadata schema from current unit."""
+        commands = (
+            f"shell.connect('{self.instance_def(self.server_config_user)}')",
+            "dba.drop_metadata_schema()",
+        )
+
+        try:
+            self._run_mysqlsh_script("\n".join(commands))
+        except MySQLClientError:
+            logger.exception("Failed to drop group replication metadata schema")
+
     def are_locks_acquired(self, from_instance: Optional[str] = None) -> bool:
         """Report if any topology change is being executed."""
         commands = (
@@ -2356,13 +2392,13 @@ def get_member_state(self) -> Tuple[str, str]:
             logger.error(
                 "Failed to get member state: mysqld daemon is down",
             )
-            raise MySQLGetMemberStateError(e.message)
+            raise MySQLUnableToGetMemberStateError(e.message)

         # output is like:
         # 'MEMBER_STATE\tMEMBER_ROLE\tMEMBER_ID\t@@server_uuid\nONLINE\tPRIMARY\t<uuid>\t<uuid>\n'
         lines = output.strip().lower().split("\n")
         if len(lines) < 2:
-            raise MySQLGetMemberStateError("No member state retrieved")
+            raise MySQLNoMemberStateError("No member state retrieved")

         if len(lines) == 2:
             # Instance just know it own state
@@ -2378,7 +2414,7 @@ def get_member_state(self) -> Tuple[str, str]:
             # filter server uuid
             return results[0], results[1] or "unknown"

-        raise MySQLGetMemberStateError("No member state retrieved")
+        raise MySQLNoMemberStateError("No member state retrieved")

     def is_cluster_replica(self, from_instance: Optional[str] = None) -> Optional[bool]:
         """Check if this cluster is a replica in a cluster set."""
@@ -2435,7 +2471,7 @@ def hold_if_recovering(self) -> None:
         while True:
             try:
                 member_state, _ = self.get_member_state()
-            except MySQLGetMemberStateError:
+            except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
                 break
             if member_state == MySQLMemberState.RECOVERING:
                 logger.debug("Unit is recovering")
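
Note: the new `only_one_cluster_node_thats_uninitialized` property sums `get_cluster_node_count()` across all peer units twice, once unfiltered and once restricted to ONLINE members, and is true only when exactly one node is registered and none of them is online, which is the signature of a `create_cluster` interrupted by a pod reschedule. A standalone sketch of that condition (the helper below is illustrative, not library code):

# Illustrative sketch: the "single uninitialized node" test over plain counts.
def only_one_uninitialized_node(total_counts: list[int], online_counts: list[int]) -> bool:
    # True only when exactly one node is registered in the cluster metadata
    # across all units and none of them is ONLINE.
    return sum(total_counts) == 1 and sum(online_counts) == 0


# Example: one unit still carries the half-created node, nothing is ONLINE.
assert only_one_uninitialized_node([1, 0, 0], [0, 0, 0]) is True
# A healthy three-node cluster does not match.
assert only_one_uninitialized_node([3, 3, 3], [3, 3, 3]) is False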

lib/charms/tempo_k8s/v2/tracing.py (4 additions, 4 deletions)

@@ -97,7 +97,7 @@ def __init__(self, *args):
 )
 from ops.framework import EventSource, Object
 from ops.model import ModelError, Relation
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, Field

 # The unique Charmhub library identifier, never change it
 LIBID = "12977e9aa0b34367903d8afeb8c3d85d"
@@ -107,7 +107,7 @@ def __init__(self, *args):

 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 8
+LIBPATCH = 9

 PYDEPS = ["pydantic"]

@@ -338,7 +338,7 @@ class Config:
 class ProtocolType(BaseModel):
     """Protocol Type."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(  # type: ignore
         # Allow serializing enum values.
         use_enum_values=True
     )
@@ -925,7 +925,7 @@ def get_endpoint(
 def charm_tracing_config(
     endpoint_requirer: TracingEndpointRequirer, cert_path: Optional[Union[Path, str]]
 ) -> Tuple[Optional[str], Optional[str]]:
-    """Utility function to determine the charm_tracing config you will likely want.
+    """Return the charm_tracing config you likely want.

     If no endpoint is provided:
         disable charm tracing.

src/charm.py (53 additions, 16 deletions)

@@ -28,13 +28,14 @@
     MySQLConfigureMySQLUsersError,
     MySQLCreateClusterError,
     MySQLGetClusterPrimaryAddressError,
-    MySQLGetMemberStateError,
     MySQLGetMySQLVersionError,
     MySQLInitializeJujuOperationsTableError,
     MySQLLockAcquisitionError,
+    MySQLNoMemberStateError,
     MySQLRebootFromCompleteOutageError,
     MySQLServiceNotRunningError,
     MySQLSetClusterPrimaryError,
+    MySQLUnableToGetMemberStateError,
 )
 from charms.mysql.v0.tls import MySQLTLS
 from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
@@ -700,7 +701,12 @@ def _on_mysql_pebble_ready(self, event) -> None:
         # First run setup
         self._configure_instance(container)

-        if not self.unit.is_leader() or self.cluster_initialized:
+        # We consider cluster initialized only if a primary already exists
+        # (as there can be metadata in the database but no primary if pod
+        # crashes while cluster is being created)
+        if not self.unit.is_leader() or (
+            self.cluster_initialized and self._get_primary_from_online_peer()
+        ):
             # Non-leader units try to join cluster
             self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
             self.unit_peer_data.update({"member-role": "secondary", "member-state": "waiting"})
@@ -710,12 +716,14 @@ def _on_mysql_pebble_ready(self, event) -> None:
         try:
             # Create the cluster when is the leader unit
             logger.info(f"Creating cluster {self.app_peer_data['cluster-name']}")
+            self.unit.status = MaintenanceStatus("Creating cluster")
             self.create_cluster()
             self.unit.status = ops.ActiveStatus(self.active_status_message)

         except (
             MySQLCreateClusterError,
-            MySQLGetMemberStateError,
+            MySQLUnableToGetMemberStateError,
+            MySQLNoMemberStateError,
             MySQLInitializeJujuOperationsTableError,
             MySQLCreateClusterError,
         ):
@@ -728,19 +736,24 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         Returns:
             bool indicating whether the caller should return
         """
-        if not self.cluster_initialized or not self.unit_peer_data.get("member-role"):
-            # health checks are only after cluster and members are initialized
+        if not self._mysql.is_mysqld_running():
             return True

-        if not self._mysql.is_mysqld_running():
+        only_single_unitialized_node_across_cluster = (
+            self.only_one_cluster_node_thats_uninitialized
+        )
+
+        if (
+            not self.cluster_initialized and not only_single_unitialized_node_across_cluster
+        ) or not self.unit_peer_data.get("member-role"):
             return True

         # retrieve and persist state for every unit
         try:
             state, role = self._mysql.get_member_state()
             self.unit_peer_data["member-state"] = state
             self.unit_peer_data["member-role"] = role
-        except MySQLGetMemberStateError:
+        except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
             logger.error("Error getting member state. Avoiding potential cluster crash recovery")
             self.unit.status = MaintenanceStatus("Unable to get member state")
             return True
@@ -757,23 +770,33 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         if state == "recovering":
             return True

-        if state in ["offline"]:
+        if state == "offline":
             # Group Replication is active but the member does not belong to any group
             all_states = {
                 self.peers.data[unit].get("member-state", "unknown") for unit in self.peers.units
             }
-            # Add state for this unit (self.peers.units does not include this unit)
-            all_states.add("offline")

-            if all_states == {"offline"} and self.unit.is_leader():
+            # Add state 'offline' for this unit (self.peers.unit does not
+            # include this unit)
+            if (all_states | {"offline"} == {"offline"} and self.unit.is_leader()) or (
+                only_single_unitialized_node_across_cluster and all_states == {"waiting"}
+            ):
                 # All instance are off, reboot cluster from outage from the leader unit

                 logger.info("Attempting reboot from complete outage.")
                 try:
-                    self._mysql.reboot_from_complete_outage()
+                    # Need condition to avoid rebooting on all units of application
+                    if self.unit.is_leader() or only_single_unitialized_node_across_cluster:
+                        self._mysql.reboot_from_complete_outage()
                 except MySQLRebootFromCompleteOutageError:
                     logger.error("Failed to reboot cluster from complete outage.")
-                    self.unit.status = BlockedStatus("failed to recover cluster.")
+
+                if only_single_unitialized_node_across_cluster and all_states == {"waiting"}:
+                    self._mysql.drop_group_replication_metadata_schema()
+                    self.create_cluster()
+                    self.unit.status = ActiveStatus(self.active_status_message)
+                else:
+                    self.unit.status = BlockedStatus("failed to recover cluster.")

             return True

@@ -785,10 +808,23 @@ def _is_cluster_blocked(self) -> bool:
         Returns: a boolean indicating whether the update-status (caller) should
         no-op and return.
         """
-        unit_member_state = self.unit_peer_data.get("member-state")
-        if unit_member_state in ["waiting", "restarting"]:
+        # We need to query member state from the server since member state would
+        # be 'offline' if pod rescheduled during cluster creation, however
+        # member-state in the unit peer databag will be 'waiting'
+        member_state_exists = True
+        try:
+            member_state, _ = self._mysql.get_member_state()
+        except MySQLUnableToGetMemberStateError:
+            logger.error("Error getting member state while checking if cluster is blocked")
+            self.unit.status = MaintenanceStatus("Unable to get member state")
+            return True
+        except MySQLNoMemberStateError:
+            member_state_exists = False
+
+        if not member_state_exists or member_state == "restarting":
             # avoid changing status while tls is being set up or charm is being initialized
-            logger.info(f"Unit state is {unit_member_state}")
+            logger.info("Unit is waiting or restarting")
+            logger.debug(f"{member_state_exists=}, {member_state=}")
             return True

         # avoid changing status while async replication is setting up
@@ -812,6 +848,7 @@ def _on_update_status(self, _: Optional[UpdateStatusEvent]) -> None:

         container = self.unit.get_container(CONTAINER_NAME)
         if not container.can_connect():
+            logger.debug("Cannot connect to pebble in the mysql container")
             return

         if self._handle_potential_cluster_crash_scenario():
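
Note: with these changes, `_handle_potential_cluster_crash_scenario` re-creates the cluster only in one narrow case: the half-created node is the sole registered node and every peer still reports 'waiting'. A condensed, illustrative decision helper (not charm code) capturing that rule:

# Illustrative sketch of the recovery rule added in this commit.
def should_recreate_cluster(only_single_uninitialized_node: bool, peer_states: set[str]) -> bool:
    # Drop stale GR metadata and re-run create_cluster only when the
    # interrupted node is the single registered node and no peer ever
    # progressed past "waiting" before the pod was rescheduled.
    return only_single_uninitialized_node and peer_states == {"waiting"}


assert should_recreate_cluster(True, {"waiting"}) is True
assert should_recreate_cluster(True, {"waiting", "online"}) is False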

tests/integration/helpers.py (1 addition, 1 deletion)

@@ -581,7 +581,7 @@ async def write_content_to_file_in_unit(
     )


-async def read_contents_from_file_in_unit(
+def read_contents_from_file_in_unit(
     ops_test: OpsTest, unit: Unit, path: str, container_name: str = CONTAINER_NAME
 ) -> str:
     """Read contents from file in the provided unit.

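Note: since `read_contents_from_file_in_unit` is now synchronous, call sites drop the `await`. A hypothetical call-site sketch (the test name and file path are illustrative):

# Hypothetical call-site after the change: the helper is a plain function now.
from .helpers import read_contents_from_file_in_unit


async def test_read_custom_config(ops_test, unit):
    # before: contents = await read_contents_from_file_in_unit(...)
    contents = read_contents_from_file_in_unit(
        ops_test, unit, "/etc/mysql/mysql.conf.d/z-custom-mysqld.cnf"
    )
    assert contents
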
tests/integration/high_availability/high_availability_helpers.py (16 additions, 0 deletions)

@@ -653,3 +653,19 @@ def delete_pvcs(pvcs: list[PersistentVolumeClaim]) -> None:
             namespace=pvc.metadata.namespace,
             grace_period=0,
         )
+
+
+def delete_pod(ops_test: OpsTest, unit: Unit) -> None:
+    """Delete the provided pod."""
+    pod_name = unit.name.replace("/", "-")
+    subprocess.run(
+        [
+            "microk8s.kubectl",
+            "-n",
+            ops_test.model.info.name,
+            "delete",
+            "pod",
+            pod_name,
+        ],
+        check=True,
+    )
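
Note: `delete_pod` shells out to `microk8s.kubectl` to delete a unit's pod in the test model's namespace, letting integration tests simulate a pod reschedule during setup. A hypothetical usage sketch (the application name, timeout, and test name are assumptions):

# Hypothetical integration test: reschedule a pod and let the charm recover.
import pytest

from .high_availability_helpers import delete_pod

APP_NAME = "mysql-k8s"  # assumed application name


@pytest.mark.abort_on_fail
async def test_pod_reschedule_recovery(ops_test):
    unit = ops_test.model.applications[APP_NAME].units[0]
    delete_pod(ops_test, unit)  # runs: microk8s.kubectl -n <model> delete pod <unit-pod>
    # The charm is expected to recover on its own and settle back to active.
    await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=30 * 60)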
