[DPE-4375] Add cluster manual re-join handler (#560)

sinclert-canonical · web-flow · commit c8e60d4b4dd9 · 2025-01-27T09:29:02.000+01:00
diff --git a/lib/charms/mysql/v0/mysql.py b/lib/charms/mysql/v0/mysql.py
@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0
 
-LIBPATCH = 80
+LIBPATCH = 81
 
 UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
 UNIT_ADD_LOCKNAME = "unit-add"
@@ -2605,6 +2605,31 @@ def lower_or_unknown(value) -> str:
 
         raise MySQLNoMemberStateError("No member state retrieved")
 
+    def is_cluster_auto_rejoin_ongoing(self) -> bool:
+        """Check if the instance is performing a cluster auto rejoin operation.
+
+        Returns False if the attempt counters cannot be parsed. Re-raises MySQLClientError.
+        """
+        cluster_auto_rejoin_command = (
+            "cursor = session.run_sql(\"SELECT work_completed, work_estimated FROM performance_schema.events_stages_current WHERE event_name LIKE '%auto-rejoin%'\")",
+            "result = cursor.fetch_one() or [0,0]",
+            "print(f'<COMPLETED_ATTEMPTS>{result[0]}</COMPLETED_ATTEMPTS>')",
+            "print(f'<ESTIMATED_ATTEMPTS>{result[1]}</ESTIMATED_ATTEMPTS>')",
+        )
+        try:
+            output = self._run_mysqlsh_script(
+                "\n".join(cluster_auto_rejoin_command),
+                user=self.server_config_user,
+                password=self.server_config_password,
+                host=self.instance_def(self.server_config_user),
+            )
+        except MySQLClientError as e:
+            logger.error("Failed to get cluster auto-rejoin information", exc_info=e)
+            raise
+        completed = re.search(r"<COMPLETED_ATTEMPTS>(\d+)</COMPLETED_ATTEMPTS>", output)
+        estimated = re.search(r"<ESTIMATED_ATTEMPTS>(\d+)</ESTIMATED_ATTEMPTS>", output)
+        return bool(completed and estimated) and int(completed.group(1)) < int(estimated.group(1))
+
     def is_cluster_replica(self, from_instance: Optional[str] = None) -> Optional[bool]:
         """Check if this cluster is a replica in a cluster set."""
         cs_status = self.get_cluster_set_status(extended=0, from_instance=from_instance)
diff --git a/src/charm.py b/src/charm.py
@@ -764,7 +764,7 @@ def _on_mysql_pebble_ready(self, event) -> None:
             logger.exception("Failed to initialize primary")
             raise
 
-    def _handle_potential_cluster_crash_scenario(self) -> bool:
+    def _handle_potential_cluster_crash_scenario(self) -> bool:  # noqa: C901
         """Handle potential full cluster crash scenarios.
 
         Returns:
@@ -831,11 +831,43 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
                         self.unit.status = ActiveStatus(self.active_status_message)
                     else:
                         self.unit.status = BlockedStatus("failed to recover cluster.")
+                finally:
+                    return True
+
+            if self._mysql.is_cluster_auto_rejoin_ongoing():
+                logger.info("Cluster auto-rejoin attempts are still ongoing.")
+            else:
+                logger.info("Cluster auto-rejoin attempts are exhausted. Attempting manual rejoin")
+                self._execute_manual_rejoin()
 
             return True
 
         return False
 
+    def _execute_manual_rejoin(self) -> None:
+        """Manually rejoin this unit's instance to the cluster.
+
+        Meant for an OFFLINE replica that is still a cluster member once the MySQL 8.0.21+
+        auto-rejoin attempts have been exhausted. Logs a warning and does nothing when the
+        instance is not in the cluster or `_get_primary_from_online_peer()` yields no primary.
+        """
+        if not self._mysql.is_instance_in_cluster(self.unit_label):
+            logger.warning("Instance does not belong to the cluster. Cannot perform manual rejoin")
+            return
+
+        primary_from_peer = self._get_primary_from_online_peer()
+        if not primary_from_peer:
+            logger.warning("Instance does not have ONLINE peers. Cannot perform manual rejoin")
+            return
+
+        # The rejoin is a removal followed by a fresh add, using the primary as from_instance.
+        self._mysql.remove_instance(unit_label=self.unit_label)
+        self._mysql.add_instance_to_cluster(
+            instance_address=self.unit_address,
+            instance_unit_label=self.unit_label,
+            from_instance=primary_from_peer,
+        )
+
     def _is_cluster_blocked(self) -> bool:
         """Performs cluster state checks for the update-status handler.
 
diff --git a/tests/integration/high_availability/test_self_healing.py b/tests/integration/high_availability/test_self_healing.py
@@ -12,9 +12,11 @@
 from tenacity import Retrying, stop_after_delay, wait_fixed
 
 from ..helpers import (
+    execute_queries_on_unit,
     get_cluster_status,
     get_primary_unit,
     get_process_pid,
+    get_unit_address,
     scale_application,
     start_mysqld_service,
     stop_mysqld_service,
@@ -498,3 +500,61 @@ async def test_single_unit_pod_delete(
         mysql_application_substring="mysql-k8s",
     )
     await clean_up_database_and_table(ops_test, database_name, table_name, credentials)
+
+
+@pytest.mark.group(7)
+@pytest.mark.abort_on_fail
+async def test_cluster_manual_rejoin(
+    ops_test: OpsTest, highly_available_cluster, continuous_writes, credentials
+) -> None:
+    """Check that a unit rejoins the cluster when automatic re-join is disabled.
+
+    Automatic re-join is turned off on one instance (the primary, to make it painful), which
+    is then gracefully restarted. The unit is expected to come back to active status.
+    """
+    # Before the disruption: continuous writes must be incrementing on all units
+    await ensure_all_units_continuous_writes_incrementing(ops_test, credentials=credentials)
+
+    app_name = get_application_name(ops_test, "mysql")
+    units = ops_test.model.applications[app_name].units
+
+    primary_unit = await get_primary_unit(ops_test, units[0], app_name)
+    primary_address = await get_unit_address(ops_test, primary_unit.name)
+
+    # Disable automatic re-join procedure; SET PERSIST is used so the setting is meant to
+    # outlive the mysqld restart below
+    execute_queries_on_unit(
+        unit_address=primary_address,
+        username=credentials["username"],
+        password=credentials["password"],
+        queries=["SET PERSIST group_replication_autorejoin_tries=0"],
+        commit=True,
+    )
+
+    # Restart mysqld on the primary: stop the service, wait for the process to be gone,
+    # then start it again
+    logger.info(f"Stopping mysqld on {primary_unit.name}")
+    await stop_mysqld_service(ops_test, primary_unit.name)
+
+    logger.info(f"Wait until mysqld stopped on {primary_unit.name}")
+    await ensure_process_not_running(
+        ops_test=ops_test,
+        unit_name=primary_unit.name,
+        container_name=MYSQL_CONTAINER_NAME,
+        process=MYSQLD_PROCESS_NAME,
+    )
+
+    logger.info(f"Starting mysqld on {primary_unit.name}")
+    await start_mysqld_service(ops_test, primary_unit.name)
+
+    # Condition handed to block_until below
+    def unit_is_active() -> bool:
+        return primary_unit.workload_status == "active"
+
+    # Verify unit comes back active
+    async with ops_test.fast_forward():
+        logger.info("Waiting unit to be back online.")
+        await ops_test.model.block_until(unit_is_active, timeout=TIMEOUT)
+
+    # Ensure continuous writes still incrementing for all units
+    await ensure_all_units_continuous_writes_incrementing(ops_test, credentials=credentials)