Skip to content

Commit 71f928b

Browse files
authored
feat: [DPE-7404] manual primary switchover (#642)
* promote action with unit scope * trigger endpoint update on topology change * integration test for switchover * lint fix * test on juju3+ * add missing scope on test * implement manual failover for unit scope * fix machine id retrieval * fix reboot/restart command * ensure cluster does not auto dissolved * chore: always sanitize exception for passwords * address pr comments * cast and compare integers
1 parent 3bc52cd commit 71f928b

File tree

12 files changed

+385
-51
lines changed

12 files changed

+385
-51
lines changed

actions.yaml

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ set-password:
2727
type: string
2828
description: The username, the default value 'root'.
2929
Possible values - root, serverconfig, clusteradmin.
30+
enum: [root, serverconfig, clusteradmin]
3031
password:
3132
type: string
3233
description: The password will be auto-generated if this option is not specified.
@@ -74,15 +75,24 @@ create-replication:
7475

7576
promote-to-primary:
7677
description: |
77-
Promotes this cluster to become the primary in the cluster-set. Used for safe switchover or failover.
78-
Can only be run against the charm leader unit of a standby cluster.
78+
Promotes the unit or cluster to become the primary in the cluster or cluster-set, depending on
79+
the scope (unit or cluster). Used for safe switchover or failover.
80+
When in cluster scope, can only be run against the charm leader unit of a standby cluster.
7981
params:
82+
scope:
83+
type: string
84+
description: Whether to promote a unit or a cluster. Must be set to either `unit` or `cluster`.
85+
enum: [unit, cluster]
8086
force:
8187
type: boolean
8288
default: False
8389
description: |
84-
Use force when previous primary is unreachable (failover). Will invalidate previous
90+
For cluster scope, use force when previous primary is unreachable (failover). Will invalidate previous
8591
primary.
92+
For unit scope, use force to force quorum from the current unit. Note that this operation is DANGEROUS
93+
as it can create a split-brain if incorrectly used and should be considered a last resort. Make
94+
absolutely sure that there are no partitions of this group that are still operating somewhere in
95+
the network, but not accessible from your location.
8696
8797
recreate-cluster:
8898
description: |

lib/charms/mysql/v0/async_replication.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ def remote_relation_data(self) -> Optional[RelationDataContent]:
148148

149149
def _on_promote_to_primary(self, event: ActionEvent) -> None:
150150
"""Promote a standby cluster to primary."""
151+
if event.params.get("scope") != "cluster":
152+
return
153+
151154
if not self._charm.unit.is_leader():
152155
event.fail("Only the leader unit can promote a standby cluster")
153156
return

lib/charms/mysql/v0/mysql.py

Lines changed: 161 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
133133
# Increment this major API version when introducing breaking changes
134134
LIBAPI = 0
135135

136-
LIBPATCH = 89
136+
LIBPATCH = 90
137137

138138
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
139139
UNIT_ADD_LOCKNAME = "unit-add"
@@ -230,6 +230,10 @@ class MySQLAddInstanceToClusterError(Error):
230230
"""Exception raised when there is an issue adding an instance to the MySQL InnoDB cluster."""
231231

232232

233+
class MySQLRejoinInstanceToClusterError(Error):
234+
"""Exception raised when there is an issue rejoining an instance to the MySQL InnoDB cluster."""
235+
236+
233237
class MySQLRemoveInstanceRetryError(Error):
234238
"""Exception raised when there is an issue removing an instance.
235239
@@ -291,6 +295,10 @@ class MySQLRebootFromCompleteOutageError(Error):
291295
"""Exception raised when there is an issue rebooting from complete outage."""
292296

293297

298+
class MySQLForceQuorumFromInstanceError(Error):
299+
"""Exception raised when there is an issue forcing quorum from an instance."""
300+
301+
294302
class MySQLSetInstanceOfflineModeError(Error):
295303
"""Exception raised when there is an issue setting instance as offline."""
296304

@@ -476,7 +484,11 @@ def __init__(self, *args):
476484
self.framework.observe(self.on.get_cluster_status_action, self._get_cluster_status)
477485
self.framework.observe(self.on.get_password_action, self._on_get_password)
478486
self.framework.observe(self.on.set_password_action, self._on_set_password)
487+
self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
479488
self.framework.observe(self.on.recreate_cluster_action, self._recreate_cluster)
489+
self.framework.observe(
490+
self.on[PEER].relation_changed, self.check_topology_timestamp_change
491+
)
480492

481493
# Set in some event handlers in order to avoid passing event down a chain
482494
# of methods
@@ -582,6 +594,43 @@ def _get_cluster_status(self, event: ActionEvent) -> None:
582594
"message": "Failed to read cluster status. See logs for more information.",
583595
})
584596

597+
def _on_promote_to_primary(self, event: ActionEvent) -> None:
598+
"""Action for setting this unit as the cluster primary."""
599+
if event.params.get("scope") != "unit":
600+
return
601+
602+
if self._mysql.is_unit_primary(self.unit_label):
603+
event.set_results({
604+
"success": False,
605+
"message": "Unit is already primary",
606+
})
607+
return
608+
609+
if event.params.get("force"):
610+
# Failover
611+
logger.info("Forcing quorum from instance")
612+
try:
613+
self._mysql.force_quorum_from_instance()
614+
except MySQLForceQuorumFromInstanceError:
615+
logger.exception("Failed to force quorum from instance")
616+
event.fail("Failed to force quorum from instance. See logs for more information.")
617+
else:
618+
# Switchover
619+
logger.info("Setting unit as cluster primary")
620+
try:
621+
self._mysql.set_cluster_primary(self.get_unit_hostname())
622+
except MySQLSetClusterPrimaryError:
623+
logger.exception("Failed to set cluster primary")
624+
event.fail("Failed to change cluster primary. See logs for more information.")
625+
626+
# Use peer relation to trigger endpoint update
627+
# refer to mysql_provider.py
628+
self.unit_peer_data.update({"topology-change-timestamp": str(int(time.time()))})
629+
event.set_results({
630+
"success": True,
631+
"message": "Unit promoted to primary",
632+
})
633+
585634
def _recreate_cluster(self, event: ActionEvent) -> None:
586635
"""Action used to recreate the cluster, for special cases."""
587636
if not self.unit.is_leader():
@@ -623,6 +672,37 @@ def create_cluster(self) -> None:
623672

624673
self.unit_peer_data.update({"member-state": state, "member-role": role})
625674

675+
@abstractmethod
676+
def update_endpoints(self) -> None:
677+
"""Update the endpoints for the cluster."""
678+
raise NotImplementedError
679+
680+
def check_topology_timestamp_change(self, _) -> None:
681+
"""Check for cluster topology changes and trigger endpoint update if needed.
682+
683+
Used to trigger endpoint updates for non-typical events like add/remove unit
684+
or update status.
685+
"""
686+
topology_change_set = {
687+
int(self.peers.data[unit]["topology-change-timestamp"])
688+
for unit in self.peers.units
689+
if self.peers.data[unit].get("topology-change-timestamp")
690+
}
691+
if not topology_change_set:
692+
# no topology change detected
693+
return
694+
topology_change = int(self.unit_peer_data.get("topology-change-timestamp", "0"))
695+
max_topology_change = max(topology_change_set)
696+
if self.unit.is_leader() and max_topology_change > topology_change:
697+
# update endpoints required
698+
self.update_endpoints()
699+
return
700+
701+
# sync timestamp and trigger relation changed
702+
self.unit_peer_data.update({
703+
"topology-change-timestamp": str(max(max_topology_change, topology_change))
704+
})
705+
626706
@property
627707
def peers(self) -> Optional[ops.model.Relation]:
628708
"""Retrieve the peer relation."""
@@ -1941,6 +2021,27 @@ def add_instance_to_cluster(
19412021
# always release the lock
19422022
self._release_lock(local_lock_instance, instance_unit_label, UNIT_ADD_LOCKNAME)
19432023

2024+
def rejoin_instance_to_cluster(self, *, unit_label: str, from_instance: str) -> None:
2025+
"""Rejoin an instance to the InnoDB cluster."""
2026+
commands = (
2027+
f"cluster = dba.get_cluster('{self.cluster_name}')",
2028+
f"cluster.rejoin_instance('{unit_label}')",
2029+
)
2030+
2031+
from_instance = from_instance or self.instance_address
2032+
2033+
try:
2034+
logger.debug(f"Rejoining instance {unit_label} to cluster {self.cluster_name}")
2035+
self._run_mysqlsh_script(
2036+
"\n".join(commands),
2037+
user=self.server_config_user,
2038+
password=self.server_config_password,
2039+
host=self.instance_def(self.server_config_user, from_instance),
2040+
)
2041+
except MySQLClientError:
2042+
logger.error(f"Failed to rejoin instance {unit_label} to cluster {self.cluster_name}")
2043+
raise MySQLRejoinInstanceToClusterError
2044+
19442045
def is_instance_configured_for_innodb(
19452046
self, instance_address: str, instance_unit_label: str
19462047
) -> bool:
@@ -2069,6 +2170,31 @@ def is_instance_in_cluster(self, unit_label: str) -> bool:
20692170
)
20702171
return False
20712172

2173+
def instance_belongs_to_cluster(self, unit_label: str) -> bool:
2174+
"""Check if instance belongs to cluster independently of current state.
2175+
2176+
Args:
2177+
unit_label: The label of the unit to check.
2178+
"""
2179+
query = (
2180+
"SELECT instance_id FROM mysql_innodb_cluster_metadata.instances WHERE cluster_id ="
2181+
"(SELECT cluster_id FROM mysql_innodb_cluster_metadata.clusters WHERE cluster_name ="
2182+
f" '{self.cluster_name}') AND instance_name = '{unit_label}';",
2183+
)
2184+
2185+
try:
2186+
output = self._run_mysqlcli_script(
2187+
query,
2188+
user=self.server_config_user,
2189+
password=self.server_config_password,
2190+
)
2191+
except MySQLClientError:
2192+
logger.debug(
2193+
"Instance has no cluster metadata, assuming it does not belong to any cluster."
2194+
)
2195+
return False
2196+
return len(output) == 1
2197+
20722198
@retry(
20732199
wait=wait_fixed(2),
20742200
stop=stop_after_attempt(3),
@@ -2218,14 +2344,23 @@ def execute_remove_instance(
22182344
wait=wait_random(min=4, max=30),
22192345
)
22202346
def remove_instance( # noqa: C901
2221-
self, unit_label: str, lock_instance: Optional[str] = None
2347+
self,
2348+
unit_label: str,
2349+
lock_instance: Optional[str] = None,
2350+
auto_dissolve: Optional[bool] = True,
22222351
) -> None:
22232352
"""Remove instance from the cluster.
22242353
22252354
This method is called from each unit being torn down, thus we must obtain
22262355
locks on the cluster primary. There is a retry mechanism for any issues
22272356
obtaining the lock, removing instances/dissolving the cluster, or releasing
22282357
the lock.
2358+
2359+
Args:
2360+
unit_label: The label of the unit to remove.
2361+
lock_instance: (optional) The instance address to acquire the lock on.
2362+
auto_dissolve: (optional) Whether to automatically dissolve the cluster
2363+
if this is the last instance in the cluster.
22292364
"""
22302365
remaining_cluster_member_addresses = []
22312366
skip_release_lock = False
@@ -2263,7 +2398,8 @@ def remove_instance( # noqa: C901
22632398
self.remove_replica_cluster(self.cluster_name)
22642399
else:
22652400
skip_release_lock = True
2266-
self.dissolve_cluster()
2401+
if auto_dissolve:
2402+
self.dissolve_cluster()
22672403

22682404
else:
22692405
# Get remaining cluster member addresses before calling mysqlsh.remove_instance()
@@ -2314,7 +2450,7 @@ def remove_instance( # noqa: C901
23142450

23152451
def dissolve_cluster(self) -> None:
23162452
"""Dissolve the cluster independently of the unit teardown process."""
2317-
logger.debug(f"Dissolving cluster {self.cluster_name}")
2453+
logger.info(f"Dissolving cluster {self.cluster_name}")
23182454
dissolve_cluster_commands = (
23192455
f"cluster = dba.get_cluster('{self.cluster_name}')",
23202456
"cluster.dissolve({'force': 'true'})",
@@ -2745,6 +2881,27 @@ def start_group_replication(self) -> None:
27452881
except MySQLClientError:
27462882
logger.warning("Failed to start Group Replication for unit")
27472883

2884+
def force_quorum_from_instance(self) -> None:
2885+
"""Force quorum from the current instance.
2886+
2887+
Recovery for cases where majority loss put the cluster in defunct state.
2888+
"""
2889+
force_quorum_command = (
2890+
f"cluster = dba.get_cluster('{self.cluster_name}')",
2891+
"cluster.force_quorum_using_partition_of()",
2892+
)
2893+
2894+
try:
2895+
self._run_mysqlsh_script(
2896+
"\n".join(force_quorum_command),
2897+
user=self.server_config_user,
2898+
password=self.server_config_password,
2899+
host=self.instance_def(self.server_config_user),
2900+
)
2901+
except MySQLClientError:
2902+
logger.error("Failed to force quorum from instance")
2903+
raise MySQLForceQuorumFromInstanceError
2904+
27482905
def reboot_from_complete_outage(self) -> None:
27492906
"""Wrapper for reboot_cluster_from_complete_outage command."""
27502907
reboot_from_outage_command = (

0 commit comments

Comments
 (0)