Skip to content

Commit d6efbab

Browse files
authored
fix: [DPE-7404] quorum loss recovery and test fixes (#671)
* serialize rejoin and fix test * Fix unit test * avoid changing unit state and set app status * typo * test for lock after delay
1 parent 57441ed commit d6efbab

File tree

5 files changed

+73
-20
lines changed

5 files changed

+73
-20
lines changed

lib/charms/mysql/v0/mysql.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def wait_until_mysql_connection(self) -> None:
127127
# Increment this major API version when introducing breaking changes
128128
LIBAPI = 0
129129

130-
LIBPATCH = 91
130+
LIBPATCH = 92
131131

132132
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
133133
UNIT_ADD_LOCKNAME = "unit-add"
@@ -2025,26 +2025,47 @@ def add_instance_to_cluster(
20252025
# always release the lock
20262026
self._release_lock(local_lock_instance, instance_unit_label, UNIT_ADD_LOCKNAME)
20272027

2028-
def rejoin_instance_to_cluster(self, *, unit_label: str, from_instance: str) -> None:
2029-
"""Rejoin an instance to the InnoDB cluster."""
2028+
def rejoin_instance_to_cluster(
2029+
self, *, unit_address: str, unit_label: str, from_instance: str
2030+
) -> None:
2031+
"""Rejoin an instance to the InnoDB cluster.
2032+
2033+
Args:
2034+
unit_address: The address of the unit to rejoin.
2035+
unit_label: The label of the unit to rejoin.
2036+
from_instance: The instance from which to rejoin the cluster.
2037+
"""
2038+
options = {"password": self.server_config_password}
20302039
commands = (
20312040
f"cluster = dba.get_cluster('{self.cluster_name}')",
2032-
f"cluster.rejoin_instance('{unit_label}')",
2041+
f"cluster.rejoin_instance('{self.instance_def(self.server_config_user, unit_address)}',"
2042+
f"{options})",
20332043
)
20342044

20352045
from_instance = from_instance or self.instance_address
2046+
if not self._acquire_lock(
2047+
from_instance,
2048+
unit_label,
2049+
UNIT_ADD_LOCKNAME,
2050+
):
2051+
raise MySQLLockAcquisitionError("Lock not acquired")
20362052

20372053
try:
2038-
logger.debug(f"Rejoining instance {unit_label} to cluster {self.cluster_name}")
2054+
logger.debug(f"Rejoining instance {unit_address} to cluster {self.cluster_name}")
20392055
self._run_mysqlsh_script(
20402056
"\n".join(commands),
20412057
user=self.server_config_user,
20422058
password=self.server_config_password,
20432059
host=self.instance_def(self.server_config_user, from_instance),
20442060
)
20452061
except MySQLClientError as e:
2046-
logger.error(f"Failed to rejoin instance {unit_label} to cluster {self.cluster_name}")
2062+
logger.error(
2063+
f"Failed to rejoin instance {unit_address} to cluster {self.cluster_name}"
2064+
)
20472065
raise MySQLRejoinInstanceToClusterError from e
2066+
finally:
2067+
# always release the lock
2068+
self._release_lock(from_instance, unit_label, UNIT_ADD_LOCKNAME)
20482069

20492070
def is_instance_configured_for_innodb(
20502071
self, instance_address: str, instance_unit_label: str
@@ -2892,17 +2913,19 @@ def force_quorum_from_instance(self) -> None:
28922913
28932914
Recovery for cases where majority loss put the cluster in defunct state.
28942915
"""
2916+
instance_definition = self.instance_def(self.server_config_user)
28952917
force_quorum_command = (
28962918
f"cluster = dba.get_cluster('{self.cluster_name}')",
2897-
"cluster.force_quorum_using_partition_of()",
2919+
f"cluster.force_quorum_using_partition_of('{self.server_config_user}@"
2920+
f"{instance_definition}','{self.server_config_password}')",
28982921
)
28992922

29002923
try:
29012924
self._run_mysqlsh_script(
29022925
"\n".join(force_quorum_command),
29032926
user=self.server_config_user,
29042927
password=self.server_config_password,
2905-
host=self.instance_def(self.server_config_user),
2928+
host=instance_definition,
29062929
)
29072930
except MySQLClientError as e:
29082931
logger.error("Failed to force quorum from instance")

src/charm.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -507,9 +507,19 @@ def _execute_manual_rejoin(self) -> None:
507507
logger.warning("Instance does not have ONLINE peers. Cannot perform manual rejoin")
508508
return
509509

510+
# add random delay to mitigate collisions when multiple units are rejoining
511+
# due to the difference between the time we test for locks and acquire them
512+
# Not used for cryptographic purpose
513+
sleep(random.uniform(0, 1.5)) # noqa: S311
514+
515+
if self._mysql.are_locks_acquired(from_instance=cluster_primary):
516+
logger.info("waiting: cluster lock is held")
517+
return
510518
try:
511519
self._mysql.rejoin_instance_to_cluster(
512-
unit_label=self.unit_label, from_instance=cluster_primary
520+
unit_address=self.unit_fqdn,
521+
unit_label=self.unit_label,
522+
from_instance=cluster_primary,
513523
)
514524
return
515525
except MySQLRejoinInstanceToClusterError:
@@ -585,15 +595,15 @@ def _on_update_status(self, _) -> None: # noqa: C901
585595
if not self._handle_non_online_instance_status(state):
586596
return
587597

588-
if self.unit.is_leader():
598+
if self.unit.is_leader() and state == "online":
589599
try:
590600
primary_address = self._mysql.get_cluster_primary_address()
591601
except MySQLGetClusterPrimaryAddressError:
592-
self.unit.status = MaintenanceStatus("Unable to query cluster primary")
593-
return
602+
primary_address = None
594603

595604
if not primary_address:
596-
self.unit.status = MaintenanceStatus("Unable to find cluster primary")
605+
logger.error("Cluster has no primary. Check cluster status on online units.")
606+
self.app.status = MaintenanceStatus("Cluster has no primary.")
597607
return
598608

599609
if "s3-block-message" in self.app_peer_data:
@@ -927,7 +937,7 @@ def join_unit_to_cluster(self) -> None:
927937

928938
# add random delay to mitigate collisions when multiple units are joining
929939
# due to the difference between the time we test for locks and acquire them
930-
# Not used for cryptographic puropse
940+
# Not used for cryptographic purpose
931941
sleep(random.uniform(0, 1.5)) # noqa: S311
932942

933943
if self._mysql.are_locks_acquired(from_instance=lock_instance or cluster_primary):

tests/integration/high_availability/high_availability_helpers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ async def deploy_and_scale_application(ops_test: OpsTest) -> str:
176176
num_units=1,
177177
channel="latest/edge",
178178
179+
config={"sleep_interval": "500"},
179180
)
180181

181182
await ops_test.model.wait_for_idle(

tests/integration/high_availability/test_primary_switchover.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
from ..markers import juju3
1111

12+
logging.getLogger("jubilant.wait").setLevel(logging.WARNING)
13+
1214

1315
@juju3
1416
@pytest.mark.abort_on_fail
@@ -56,13 +58,30 @@ def test_cluster_failover_after_majority_loss(juju: Juju, highly_available_clust
5658

5759
logging.info(f"Unit selected for promotion: {unit_to_promote}")
5860

59-
logging.info("Rebooting all but one unit to simulate majority loss...")
60-
for unit in [non_primary_units.pop(), primary_unit]:
61-
machine_name = get_unit_machine(juju, app_name, unit)
62-
run(["lxc", "restart", machine_name], check=True)
61+
logging.info("Kill all but one unit to simulate majority loss...")
62+
units_to_kill = [non_primary_units.pop(), primary_unit]
63+
machine_name = []
64+
for unit in units_to_kill:
65+
machine_name.append(get_unit_machine(juju, app_name, unit))
66+
67+
run(["lxc", "restart", "--force", machine_name[0], machine_name[1]], check=True)
68+
69+
juju.model_config({"update-status-hook-interval": "45s"})
70+
logging.info("Waiting to settle in error state")
71+
juju.wait(
72+
lambda status: status.apps[app_name].units[unit_to_promote].workload_status.current
73+
== "active"
74+
and status.apps[app_name].units[units_to_kill[0]].workload_status.message == "offline"
75+
and status.apps[app_name].units[units_to_kill[1]].workload_status.message == "offline",
76+
timeout=60 * 15,
77+
delay=15,
78+
)
6379

6480
failover_task = juju.run(
65-
unit_to_promote, "promote-to-primary", {"scope": "unit", "force": True}
81+
unit_to_promote,
82+
"promote-to-primary",
83+
{"scope": "unit", "force": True},
84+
wait=600,
6685
)
6786

6887
juju.model_config({"update-status-hook-interval": "15s"})

tests/unit/test_charm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def test_on_update(
310310
_get_member_state.assert_called_once()
311311
_reboot_from_complete_outage.assert_called_once()
312312
_snap_service_operation.assert_called()
313-
_get_cluster_primary_address.assert_called_once()
313+
_get_cluster_primary_address.assert_not_called()
314314

315315
self.assertTrue(isinstance(self.harness.model.unit.status, MaintenanceStatus))
316316
# test instance state = unreachable

0 commit comments

Comments
 (0)