Skip to content

Commit c8e60d4

Browse files
[DPE-4375] Add cluster manual re-join handler (#560)
1 parent 7c6b120 commit c8e60d4

File tree

3 files changed

+119
-2
lines changed

3 files changed

+119
-2
lines changed

lib/charms/mysql/v0/mysql.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
133133
# Increment this major API version when introducing breaking changes
134134
LIBAPI = 0
135135

136-
LIBPATCH = 80
136+
LIBPATCH = 81
137137

138138
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
139139
UNIT_ADD_LOCKNAME = "unit-add"
@@ -2605,6 +2605,31 @@ def lower_or_unknown(value) -> str:
26052605

26062606
raise MySQLNoMemberStateError("No member state retrieved")
26072607

2608+
def is_cluster_auto_rejoin_ongoing(self) -> bool:
    """Check if the instance is performing a cluster auto rejoin operation.

    Runs a mysqlsh script that reads `work_completed` and `work_estimated`
    from `performance_schema.events_stages_current` for the stage whose
    event name matches `%auto-rejoin%`, then compares the two counters.

    Returns:
        True if fewer attempts were completed than estimated. False
        otherwise, including when no matching stage row exists (the script
        then reports 0 completed out of 0 estimated).

    Raises:
        MySQLClientError: if the mysqlsh script fails to run.
    """
    cluster_auto_rejoin_command = (
        "cursor = session.run_sql(\"SELECT work_completed, work_estimated FROM performance_schema.events_stages_current WHERE event_name LIKE '%auto-rejoin%'\")",
        # Fall back to 0/0 so both tags are printed even without a matching row
        "result = cursor.fetch_one() or [0,0]",
        "print(f'<COMPLETED_ATTEMPTS>{result[0]}</COMPLETED_ATTEMPTS>')",
        "print(f'<ESTIMATED_ATTEMPTS>{result[1]}</ESTIMATED_ATTEMPTS>')",
    )

    try:
        output = self._run_mysqlsh_script(
            "\n".join(cluster_auto_rejoin_command),
            user=self.server_config_user,
            password=self.server_config_password,
            host=self.instance_def(self.server_config_user),
        )
    except MySQLClientError as e:
        logger.error("Failed to get cluster auto-rejoin information", exc_info=e)
        raise

    # `\d+` rather than `\d`: the counters are not limited to a single digit,
    # and a one-digit pattern fails to match values of 10 or more.
    completed_matches = re.search(r"<COMPLETED_ATTEMPTS>(\d+)</COMPLETED_ATTEMPTS>", output)
    estimated_matches = re.search(r"<ESTIMATED_ATTEMPTS>(\d+)</ESTIMATED_ATTEMPTS>", output)

    return int(completed_matches.group(1)) < int(estimated_matches.group(1))
2632+
26082633
def is_cluster_replica(self, from_instance: Optional[str] = None) -> Optional[bool]:
26092634
"""Check if this cluster is a replica in a cluster set."""
26102635
cs_status = self.get_cluster_set_status(extended=0, from_instance=from_instance)

src/charm.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,7 @@ def _on_mysql_pebble_ready(self, event) -> None:
764764
logger.exception("Failed to initialize primary")
765765
raise
766766

767-
def _handle_potential_cluster_crash_scenario(self) -> bool:
767+
def _handle_potential_cluster_crash_scenario(self) -> bool: # noqa: C901
768768
"""Handle potential full cluster crash scenarios.
769769
770770
Returns:
@@ -831,11 +831,43 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
831831
self.unit.status = ActiveStatus(self.active_status_message)
832832
else:
833833
self.unit.status = BlockedStatus("failed to recover cluster.")
834+
finally:
835+
return True
836+
837+
if self._mysql.is_cluster_auto_rejoin_ongoing():
838+
logger.info("Cluster auto-rejoin attempts are still ongoing.")
839+
else:
840+
logger.info("Cluster auto-rejoin attempts are exhausted. Attempting manual rejoin")
841+
self._execute_manual_rejoin()
834842

835843
return True
836844

837845
return False
838846

847+
def _execute_manual_rejoin(self) -> None:
    """Manually rejoin this unit's instance to the cluster.

    Intended to be called once the MySQL 8.0.21+ auto-rejoin attempts have
    been exhausted, on an OFFLINE replica that still belongs to the cluster.

    The instance is removed from the cluster and then added back through the
    primary reported by an online peer. Nothing is done (a warning is logged)
    when the instance is not a cluster member or no primary can be obtained.
    """
    # Guard: only an instance that is still a cluster member can be rejoined.
    if not self._mysql.is_instance_in_cluster(self.unit_label):
        logger.warning("Instance does not belong to the cluster. Cannot perform manual rejoin")
        return

    # Guard: a primary obtained from an online peer is needed to add the instance back.
    online_primary = self._get_primary_from_online_peer()
    if not online_primary:
        logger.warning("Instance does not have ONLINE peers. Cannot perform manual rejoin")
        return

    # Rejoin is done as remove-then-add, driven from the primary.
    self._mysql.remove_instance(unit_label=self.unit_label)
    self._mysql.add_instance_to_cluster(
        instance_address=self.unit_address,
        instance_unit_label=self.unit_label,
        from_instance=online_primary,
    )
870+
839871
def _is_cluster_blocked(self) -> bool:
840872
"""Performs cluster state checks for the update-status handler.
841873

tests/integration/high_availability/test_self_healing.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
from tenacity import Retrying, stop_after_delay, wait_fixed
1313

1414
from ..helpers import (
15+
execute_queries_on_unit,
1516
get_cluster_status,
1617
get_primary_unit,
1718
get_process_pid,
19+
get_unit_address,
1820
scale_application,
1921
start_mysqld_service,
2022
stop_mysqld_service,
@@ -498,3 +500,61 @@ async def test_single_unit_pod_delete(
498500
mysql_application_substring="mysql-k8s",
499501
)
500502
await clean_up_database_and_table(ops_test, database_name, table_name, credentials)
503+
504+
505+
@pytest.mark.group(7)
@pytest.mark.abort_on_fail
async def test_cluster_manual_rejoin(
    ops_test: OpsTest, highly_available_cluster, continuous_writes, credentials
) -> None:
    """Test the cluster manual re-join.

    Automatic re-join is disabled on one instance (the primary, to make it
    painful), which is then gracefully restarted. The test verifies that the
    unit still comes back active and that continuous writes keep incrementing.
    """
    # Sanity check: continuous writes are incrementing on all units before the disruption
    await ensure_all_units_continuous_writes_incrementing(ops_test, credentials=credentials)

    app_name = get_application_name(ops_test, "mysql")
    units = ops_test.model.applications[app_name].units

    primary_unit = await get_primary_unit(ops_test, units[0], app_name)
    primary_address = await get_unit_address(ops_test, primary_unit.name)

    # Disable automatic re-join procedure
    execute_queries_on_unit(
        unit_address=primary_address,
        username=credentials["username"],
        password=credentials["password"],
        queries=["SET PERSIST group_replication_autorejoin_tries=0"],
        commit=True,
    )

    # Graceful restart: stop mysqld, wait for the process to be gone, start it again
    logger.info(f"Stopping mysqld on {primary_unit.name}")
    await stop_mysqld_service(ops_test, primary_unit.name)

    logger.info(f"Wait until mysqld stopped on {primary_unit.name}")
    await ensure_process_not_running(
        ops_test=ops_test,
        unit_name=primary_unit.name,
        container_name=MYSQL_CONTAINER_NAME,
        process=MYSQLD_PROCESS_NAME,
    )

    logger.info(f"Starting mysqld on {primary_unit.name}")
    await start_mysqld_service(ops_test, primary_unit.name)

    # Verify unit comes back active
    async with ops_test.fast_forward():
        logger.info("Waiting unit to be back online.")
        await ops_test.model.block_until(
            lambda: primary_unit.workload_status == "active",
            timeout=TIMEOUT,
        )

    # Continuous writes must still be incrementing on all units after the rejoin
    await ensure_all_units_continuous_writes_incrementing(ops_test, credentials=credentials)

0 commit comments

Comments
 (0)