
Commit 2009919

Recover from pod restarts during cluster creation during setup (#499)

* Recover from pod restarts during cluster creation during setup
* Commit missed file after format
* Re-add unnecessarily removed code
* Only run create_cluster in update_status if all other units are waiting and current unit is offline
* Address PR feedback
* Fix failing scale down and then scale up integration test
* Add comments to explain measures to recover from pod reschedule during cluster creation
* Pull latest charm libs for mysql and tracing
* Pull in latest mysql backups charm lib

1 parent dfdd0bd · commit 2009919
8 files changed (+224, −33 lines)

lib/charms/mysql/v0/backups.py (4 additions, 3 deletions)

@@ -60,9 +60,9 @@ def is_unit_blocked(self) -> bool:
     MySQLDeleteTempRestoreDirectoryError,
     MySQLEmptyDataDirectoryError,
     MySQLExecuteBackupCommandsError,
-    MySQLGetMemberStateError,
     MySQLInitializeJujuOperationsTableError,
     MySQLKillSessionError,
+    MySQLNoMemberStateError,
     MySQLOfflineModeAndHiddenInstanceExistsError,
     MySQLPrepareBackupForRestoreError,
     MySQLRescanClusterError,
@@ -73,6 +73,7 @@ def is_unit_blocked(self) -> bool:
     MySQLSetInstanceOptionError,
     MySQLStartMySQLDError,
     MySQLStopMySQLDError,
+    MySQLUnableToGetMemberStateError,
 )
 from charms.mysql.v0.s3_helpers import (
     fetch_and_check_existence_of_s3_path,
@@ -99,7 +100,7 @@ def is_unit_blocked(self) -> bool:

 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 11
+LIBPATCH = 12


 if typing.TYPE_CHECKING:
@@ -339,7 +340,7 @@ def _can_unit_perform_backup(self) -> Tuple[bool, Optional[str]]:

         try:
             state, role = self.charm._mysql.get_member_state()
-        except MySQLGetMemberStateError:
+        except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
             return False, "Error obtaining member state"

         if role == "primary" and self.charm.app.planned_units() > 1:
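
Note: the backups library now distinguishes two failure modes when reading member state. A minimal sketch of how a caller might treat them differently (the `describe_member` helper and its `mysql` argument are illustrative, not part of this commit):

# Hypothetical caller sketch: handling the split exceptions separately.
from charms.mysql.v0.mysql import (
    MySQLNoMemberStateError,
    MySQLUnableToGetMemberStateError,
)


def describe_member(mysql) -> str:
    """Return a coarse description of a unit's Group Replication state."""
    try:
        state, role = mysql.get_member_state()
    except MySQLUnableToGetMemberStateError:
        # mysqld could not be queried at all (e.g. the daemon is down).
        return "unreachable"
    except MySQLNoMemberStateError:
        # mysqld answered, but reported no member state row - e.g. after a
        # pod reschedule interrupted cluster creation.
        return "uninitialized"
    return f"{state}/{role}"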

lib/charms/mysql/v0/mysql.py (43 additions, 7 deletions)

@@ -134,7 +134,7 @@ def wait_until_mysql_connection(self) -> None:
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0

-LIBPATCH = 72
+LIBPATCH = 73

 UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
 UNIT_ADD_LOCKNAME = "unit-add"
@@ -276,8 +276,12 @@ class MySQLGrantPrivilegesToUserError(Error):
     """Exception raised when there is an issue granting privileges to user."""


-class MySQLGetMemberStateError(Error):
-    """Exception raised when there is an issue getting member state."""
+class MySQLNoMemberStateError(Error):
+    """Exception raised when there is no member state."""
+
+
+class MySQLUnableToGetMemberStateError(Error):
+    """Exception raised when unable to get member state."""


 class MySQLGetClusterEndpointsError(Error):
@@ -620,6 +624,26 @@ def cluster_initialized(self) -> bool:

         return False

+    @property
+    def only_one_cluster_node_thats_uninitialized(self) -> Optional[bool]:
+        """Check if only a single cluster node exists across all units."""
+        if not self.app_peer_data.get("cluster-name"):
+            return None
+
+        total_cluster_nodes = 0
+        for unit in self.app_units:
+            total_cluster_nodes += self._mysql.get_cluster_node_count(
+                from_instance=self.get_unit_address(unit)
+            )
+
+        total_online_cluster_nodes = 0
+        for unit in self.app_units:
+            total_online_cluster_nodes += self._mysql.get_cluster_node_count(
+                from_instance=self.get_unit_address(unit), node_status=MySQLMemberState["ONLINE"]
+            )
+
+        return total_cluster_nodes == 1 and total_online_cluster_nodes == 0
+
     @property
     def cluster_fully_initialized(self) -> bool:
         """Returns True if the cluster is fully initialized.
@@ -1728,6 +1752,18 @@ def is_instance_configured_for_innodb(
         )
         return False

+    def drop_group_replication_metadata_schema(self) -> None:
+        """Drop the group replication metadata schema from current unit."""
+        commands = (
+            f"shell.connect('{self.instance_def(self.server_config_user)}')",
+            "dba.drop_metadata_schema()",
+        )
+
+        try:
+            self._run_mysqlsh_script("\n".join(commands))
+        except MySQLClientError:
+            logger.exception("Failed to drop group replication metadata schema")
+
     def are_locks_acquired(self, from_instance: Optional[str] = None) -> bool:
         """Report if any topology change is being executed."""
         commands = (
@@ -2356,13 +2392,13 @@ def get_member_state(self) -> Tuple[str, str]:
             logger.error(
                 "Failed to get member state: mysqld daemon is down",
             )
-            raise MySQLGetMemberStateError(e.message)
+            raise MySQLUnableToGetMemberStateError(e.message)

         # output is like:
         # 'MEMBER_STATE\tMEMBER_ROLE\tMEMBER_ID\t@@server_uuid\nONLINE\tPRIMARY\t<uuid>\t<uuid>\n'
         lines = output.strip().lower().split("\n")
         if len(lines) < 2:
-            raise MySQLGetMemberStateError("No member state retrieved")
+            raise MySQLNoMemberStateError("No member state retrieved")

         if len(lines) == 2:
             # Instance just know it own state
@@ -2378,7 +2414,7 @@ def get_member_state(self) -> Tuple[str, str]:
             # filter server uuid
             return results[0], results[1] or "unknown"

-        raise MySQLGetMemberStateError("No member state retrieved")
+        raise MySQLNoMemberStateError("No member state retrieved")

     def is_cluster_replica(self, from_instance: Optional[str] = None) -> Optional[bool]:
         """Check if this cluster is a replica in a cluster set."""
@@ -2435,7 +2471,7 @@ def hold_if_recovering(self) -> None:
         while True:
             try:
                 member_state, _ = self.get_member_state()
-            except MySQLGetMemberStateError:
+            except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
                 break
             if member_state == MySQLMemberState.RECOVERING:
                 logger.debug("Unit is recovering")
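
Note: the new `only_one_cluster_node_thats_uninitialized` property sums `get_cluster_node_count()` across all peer units twice, once unfiltered and once restricted to ONLINE members, and is true only when exactly one node is registered and none of them is online, which is the signature of a `create_cluster` interrupted by a pod reschedule. A standalone sketch of that condition (the helper below is illustrative, not library code):

# Illustrative sketch: the "single uninitialized node" test over plain counts.
def only_one_uninitialized_node(total_counts: list[int], online_counts: list[int]) -> bool:
    # True only when exactly one node is registered in the cluster metadata
    # across all units and none of them is ONLINE.
    return sum(total_counts) == 1 and sum(online_counts) == 0


# Example: one unit still carries the half-created node, nothing is ONLINE.
assert only_one_uninitialized_node([1, 0, 0], [0, 0, 0]) is True
# A healthy three-node cluster does not match.
assert only_one_uninitialized_node([3, 3, 3], [3, 3, 3]) is False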

lib/charms/tempo_k8s/v2/tracing.py (4 additions, 4 deletions)

@@ -97,7 +97,7 @@ def __init__(self, *args):
 )
 from ops.framework import EventSource, Object
 from ops.model import ModelError, Relation
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, Field

 # The unique Charmhub library identifier, never change it
 LIBID = "12977e9aa0b34367903d8afeb8c3d85d"
@@ -107,7 +107,7 @@ def __init__(self, *args):

 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 8
+LIBPATCH = 9

 PYDEPS = ["pydantic"]

@@ -338,7 +338,7 @@ class Config:
 class ProtocolType(BaseModel):
     """Protocol Type."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(  # type: ignore
         # Allow serializing enum values.
         use_enum_values=True
     )
@@ -925,7 +925,7 @@ def get_endpoint(
 def charm_tracing_config(
     endpoint_requirer: TracingEndpointRequirer, cert_path: Optional[Union[Path, str]]
 ) -> Tuple[Optional[str], Optional[str]]:
-    """Utility function to determine the charm_tracing config you will likely want.
+    """Return the charm_tracing config you likely want.

     If no endpoint is provided:
         disable charm tracing.

src/charm.py (53 additions, 16 deletions)

@@ -28,13 +28,14 @@
     MySQLConfigureMySQLUsersError,
     MySQLCreateClusterError,
     MySQLGetClusterPrimaryAddressError,
-    MySQLGetMemberStateError,
     MySQLGetMySQLVersionError,
     MySQLInitializeJujuOperationsTableError,
     MySQLLockAcquisitionError,
+    MySQLNoMemberStateError,
     MySQLRebootFromCompleteOutageError,
     MySQLServiceNotRunningError,
     MySQLSetClusterPrimaryError,
+    MySQLUnableToGetMemberStateError,
 )
 from charms.mysql.v0.tls import MySQLTLS
 from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
@@ -700,7 +701,12 @@ def _on_mysql_pebble_ready(self, event) -> None:
         # First run setup
         self._configure_instance(container)

-        if not self.unit.is_leader() or self.cluster_initialized:
+        # We consider cluster initialized only if a primary already exists
+        # (as there can be metadata in the database but no primary if pod
+        # crashes while cluster is being created)
+        if not self.unit.is_leader() or (
+            self.cluster_initialized and self._get_primary_from_online_peer()
+        ):
             # Non-leader units try to join cluster
             self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
             self.unit_peer_data.update({"member-role": "secondary", "member-state": "waiting"})
@@ -710,12 +716,14 @@ def _on_mysql_pebble_ready(self, event) -> None:
         try:
             # Create the cluster when is the leader unit
             logger.info(f"Creating cluster {self.app_peer_data['cluster-name']}")
+            self.unit.status = MaintenanceStatus("Creating cluster")
             self.create_cluster()
             self.unit.status = ops.ActiveStatus(self.active_status_message)

         except (
             MySQLCreateClusterError,
-            MySQLGetMemberStateError,
+            MySQLUnableToGetMemberStateError,
+            MySQLNoMemberStateError,
             MySQLInitializeJujuOperationsTableError,
             MySQLCreateClusterError,
         ):
@@ -728,19 +736,24 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         Returns:
             bool indicating whether the caller should return
         """
-        if not self.cluster_initialized or not self.unit_peer_data.get("member-role"):
-            # health checks are only after cluster and members are initialized
+        if not self._mysql.is_mysqld_running():
             return True

-        if not self._mysql.is_mysqld_running():
+        only_single_unitialized_node_across_cluster = (
+            self.only_one_cluster_node_thats_uninitialized
+        )
+
+        if (
+            not self.cluster_initialized and not only_single_unitialized_node_across_cluster
+        ) or not self.unit_peer_data.get("member-role"):
             return True

         # retrieve and persist state for every unit
         try:
             state, role = self._mysql.get_member_state()
             self.unit_peer_data["member-state"] = state
             self.unit_peer_data["member-role"] = role
-        except MySQLGetMemberStateError:
+        except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
             logger.error("Error getting member state. Avoiding potential cluster crash recovery")
             self.unit.status = MaintenanceStatus("Unable to get member state")
             return True
@@ -757,23 +770,33 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         if state == "recovering":
             return True

-        if state in ["offline"]:
+        if state == "offline":
             # Group Replication is active but the member does not belong to any group
             all_states = {
                 self.peers.data[unit].get("member-state", "unknown") for unit in self.peers.units
             }
-            # Add state for this unit (self.peers.units does not include this unit)
-            all_states.add("offline")

-            if all_states == {"offline"} and self.unit.is_leader():
+            # Add state 'offline' for this unit (self.peers.unit does not
+            # include this unit)
+            if (all_states | {"offline"} == {"offline"} and self.unit.is_leader()) or (
+                only_single_unitialized_node_across_cluster and all_states == {"waiting"}
+            ):
                 # All instance are off, reboot cluster from outage from the leader unit

                 logger.info("Attempting reboot from complete outage.")
                 try:
-                    self._mysql.reboot_from_complete_outage()
+                    # Need condition to avoid rebooting on all units of application
+                    if self.unit.is_leader() or only_single_unitialized_node_across_cluster:
+                        self._mysql.reboot_from_complete_outage()
                 except MySQLRebootFromCompleteOutageError:
                     logger.error("Failed to reboot cluster from complete outage.")
-                    self.unit.status = BlockedStatus("failed to recover cluster.")
+
+                if only_single_unitialized_node_across_cluster and all_states == {"waiting"}:
+                    self._mysql.drop_group_replication_metadata_schema()
+                    self.create_cluster()
+                    self.unit.status = ActiveStatus(self.active_status_message)
+                else:
+                    self.unit.status = BlockedStatus("failed to recover cluster.")

             return True

@@ -785,10 +808,23 @@ def _is_cluster_blocked(self) -> bool:
         Returns: a boolean indicating whether the update-status (caller) should
         no-op and return.
         """
-        unit_member_state = self.unit_peer_data.get("member-state")
-        if unit_member_state in ["waiting", "restarting"]:
+        # We need to query member state from the server since member state would
+        # be 'offline' if pod rescheduled during cluster creation, however
+        # member-state in the unit peer databag will be 'waiting'
+        member_state_exists = True
+        try:
+            member_state, _ = self._mysql.get_member_state()
+        except MySQLUnableToGetMemberStateError:
+            logger.error("Error getting member state while checking if cluster is blocked")
+            self.unit.status = MaintenanceStatus("Unable to get member state")
+            return True
+        except MySQLNoMemberStateError:
+            member_state_exists = False
+
+        if not member_state_exists or member_state == "restarting":
             # avoid changing status while tls is being set up or charm is being initialized
-            logger.info(f"Unit state is {unit_member_state}")
+            logger.info("Unit is waiting or restarting")
+            logger.debug(f"{member_state_exists=}, {member_state=}")
             return True

         # avoid changing status while async replication is setting up
@@ -812,6 +848,7 @@ def _on_update_status(self, _: Optional[UpdateStatusEvent]) -> None:

         container = self.unit.get_container(CONTAINER_NAME)
         if not container.can_connect():
+            logger.debug("Cannot connect to pebble in the mysql container")
             return

         if self._handle_potential_cluster_crash_scenario():
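
Note: with these changes, `_handle_potential_cluster_crash_scenario` re-creates the cluster only in one narrow case: the half-created node is the sole registered node and every peer still reports 'waiting'. A condensed, illustrative decision helper (not charm code) capturing that rule:

# Illustrative sketch of the recovery rule added in this commit.
def should_recreate_cluster(only_single_uninitialized_node: bool, peer_states: set[str]) -> bool:
    # Drop stale GR metadata and re-run create_cluster only when the
    # interrupted node is the single registered node and no peer ever
    # progressed past "waiting" before the pod was rescheduled.
    return only_single_uninitialized_node and peer_states == {"waiting"}


assert should_recreate_cluster(True, {"waiting"}) is True
assert should_recreate_cluster(True, {"waiting", "online"}) is False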

tests/integration/helpers.py (1 addition, 1 deletion)

@@ -581,7 +581,7 @@ async def write_content_to_file_in_unit(
     )


-async def read_contents_from_file_in_unit(
+def read_contents_from_file_in_unit(
     ops_test: OpsTest, unit: Unit, path: str, container_name: str = CONTAINER_NAME
 ) -> str:
     """Read contents from file in the provided unit.

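Note: since `read_contents_from_file_in_unit` is now synchronous, call sites drop the `await`. A hypothetical call-site sketch (the test name and file path are illustrative):

# Hypothetical call-site after the change: the helper is a plain function now.
from .helpers import read_contents_from_file_in_unit


async def test_read_custom_config(ops_test, unit):
    # before: contents = await read_contents_from_file_in_unit(...)
    contents = read_contents_from_file_in_unit(
        ops_test, unit, "/etc/mysql/mysql.conf.d/z-custom-mysqld.cnf"
    )
    assert contents
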
tests/integration/high_availability/high_availability_helpers.py (16 additions, 0 deletions)

@@ -653,3 +653,19 @@ def delete_pvcs(pvcs: list[PersistentVolumeClaim]) -> None:
             namespace=pvc.metadata.namespace,
             grace_period=0,
         )
+
+
+def delete_pod(ops_test: OpsTest, unit: Unit) -> None:
+    """Delete the provided pod."""
+    pod_name = unit.name.replace("/", "-")
+    subprocess.run(
+        [
+            "microk8s.kubectl",
+            "-n",
+            ops_test.model.info.name,
+            "delete",
+            "pod",
+            pod_name,
+        ],
+        check=True,
+    )
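
Note: `delete_pod` shells out to `microk8s.kubectl` to delete a unit's pod in the test model's namespace, letting integration tests simulate a pod reschedule during setup. A hypothetical usage sketch (the application name, timeout, and test name are assumptions):

# Hypothetical integration test: reschedule a pod and let the charm recover.
import pytest

from .high_availability_helpers import delete_pod

APP_NAME = "mysql-k8s"  # assumed application name


@pytest.mark.abort_on_fail
async def test_pod_reschedule_recovery(ops_test):
    unit = ops_test.model.applications[APP_NAME].units[0]
    delete_pod(ops_test, unit)  # runs: microk8s.kubectl -n <model> delete pod <unit-pod>
    # The charm is expected to recover on its own and settle back to active.
    await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=30 * 60)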
