
Commit 1072683

fix: DPE-6485 optional restart on instance configuration (#607)
* optional restart for pebble-managed mysqld
* refactor to ditch rolling ops event defer and reuse recovery method
* method moved to charm.py
* remove `charm`
* revert change in method
* fix test
* fix sed string
* pythonic failure injection method: tests consistently fail on CI but work on my machine(tm), so this eliminates any differences from the env
1 parent ff14b4d commit 1072683

File tree: 6 files changed (+81, -72 lines)

lib/charms/mysql/v0/mysql.py

Lines changed: 7 additions & 2 deletions

@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0
 
-LIBPATCH = 82
+LIBPATCH = 83
 
 UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
 UNIT_ADD_LOCKNAME = "unit-add"
@@ -1463,7 +1463,11 @@ def get_variable_value(self, variable: str) -> str:
         return rows[0][1]
 
     def configure_instance(self, create_cluster_admin: bool = True) -> None:
-        """Configure the instance to be used in an InnoDB cluster."""
+        """Configure the instance to be used in an InnoDB cluster.
+
+        Args:
+            create_cluster_admin: Whether to create the cluster admin user.
+        """
         options = {
             "restart": "true",
         }
@@ -1955,6 +1959,7 @@ def is_instance_in_cluster(self, unit_label: str) -> bool:
             user=self.server_config_user,
             password=self.server_config_password,
             host=self.instance_def(self.server_config_user),
+            exception_as_warning=True,
         )
         return (
             MySQLMemberState.ONLINE in output.lower()
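The new `exception_as_warning=True` flag on the mysqlsh call is presumably what keeps expected, transient failures (the instance being checked is mid-restart) out of the error log. A minimal sketch of that pattern, with a hypothetical `run_script` helper standing in for the library's internal script runner:

```python
import logging
import subprocess

logger = logging.getLogger(__name__)


def run_script(script: str, exception_as_warning: bool = False) -> str:
    """Run a script, optionally downgrading failures to warnings.

    Hypothetical stand-in for the charm lib's mysqlsh runner; only the
    error-handling pattern implied by `exception_as_warning` is sketched.
    """
    try:
        result = subprocess.run(
            ["bash", "-c", script], capture_output=True, text=True, check=True
        )
        return result.stdout
    except subprocess.CalledProcessError as error:
        if exception_as_warning:
            # Expected while the instance restarts: log quietly and move on.
            logger.warning("script failed: %s", error)
            return ""
        logger.error("script failed: %s", error)
        raise
```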

src/charm.py

Lines changed: 39 additions & 17 deletions

@@ -873,28 +873,50 @@ def join_unit_to_cluster(self) -> None:
         self.unit.status = ActiveStatus(self.active_status_message)
         logger.info(f"Instance {instance_label} added to cluster")
 
-    def _restart(self, event: EventBase) -> None:
-        """Restart the MySQL service."""
-        if self.peers.units != self.restart_peers.units:
-            # defer restart until all units are in the relation
-            logger.debug("Deferring restart until all units are in the relation")
-            event.defer()
+    def recover_unit_after_restart(self) -> None:
+        """Wait for unit recovery/rejoin after restart."""
+        recovery_attempts = 30
+        logger.info("Recovering unit")
+        if self.app.planned_units() == 1:
+            self._mysql.reboot_from_complete_outage()
+        else:
+            try:
+                for attempt in Retrying(
+                    stop=stop_after_attempt(recovery_attempts), wait=wait_fixed(15)
+                ):
+                    with attempt:
+                        self._mysql.hold_if_recovering()
+                        if not self._mysql.is_instance_in_cluster(self.unit_label):
+                            logger.debug(
+                                "Instance not yet back in the cluster."
+                                f" Retry {attempt.retry_state.attempt_number}/{recovery_attempts}"
+                            )
+                            raise Exception
+            except RetryError:
+                raise
+
+    def _restart(self, _: EventBase) -> None:
+        """Restart the service."""
+        if not self.unit_initialized:
+            logger.debug("Restarting standalone mysqld")
+            self._mysql.restart_mysqld()
             return
-        if self.peers.units and self._mysql.is_unit_primary(self.unit_label):
-            restart_states = {
-                self.restart_peers.data[unit].get("state", "unset") for unit in self.peers.units
-            }
-            if restart_states == {"unset"}:
-                logger.debug("Restarting leader")
-            elif restart_states != {"release"}:
-                # Wait other units restart first to minimize primary switchover
-                logger.debug("Primary is waiting for other units to restart")
-                event.defer()
-                return
 
+        if self.app.planned_units() > 1 and self._mysql.is_unit_primary(self.unit_label):
+            try:
+                new_primary = self.get_unit_address(self.peers.units.pop())
+                logger.debug(f"Switching primary to {new_primary}")
+                self._mysql.set_cluster_primary(new_primary)
+            except MySQLSetClusterPrimaryError:
+                logger.warning("Changing primary failed")
+
+        logger.debug("Restarting mysqld")
         self.unit.status = MaintenanceStatus("restarting MySQL")
         self._mysql.restart_mysqld()
+        self.unit.status = MaintenanceStatus("recovering unit after restart")
         sleep(10)
+        self.recover_unit_after_restart()
+
         self._on_update_status(None)
 
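`recover_unit_after_restart` leans on tenacity's `Retrying` iterator: a bare `raise` inside the `with attempt:` block marks the attempt as failed, tenacity waits 15 seconds and retries, and after 30 failed attempts a `RetryError` propagates to the caller. A stripped-down, standalone sketch of that control flow (the `wait_until` helper and the small attempt/wait values are illustrative only):

```python
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

ATTEMPTS = 3  # the charm uses 30 attempts with a 15-second wait


def wait_until(predicate) -> None:
    """Poll `predicate` until it returns True or attempts run out."""
    try:
        for attempt in Retrying(stop=stop_after_attempt(ATTEMPTS), wait=wait_fixed(0.1)):
            with attempt:
                if not predicate():
                    # Raising inside the attempt marks it failed and schedules a retry;
                    # once attempts are exhausted, tenacity raises RetryError instead.
                    raise Exception("not ready yet")
    except RetryError:
        # The caller decides what a failed recovery means (e.g. a failed upgrade).
        raise


if __name__ == "__main__":
    checks = iter([False, False, True])
    wait_until(lambda: next(checks))  # succeeds on the third attempt
```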

src/upgrade.py

Lines changed: 1 addition & 31 deletions

@@ -29,7 +29,6 @@
 from ops import RelationDataContent
 from ops.model import BlockedStatus, MaintenanceStatus, Unit
 from pydantic import BaseModel
-from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
 from typing_extensions import override
 
 from constants import CHARMED_MYSQL_COMMON_DIRECTORY
@@ -40,9 +39,6 @@
 logger = logging.getLogger(__name__)
 
 
-RECOVER_ATTEMPTS = 30
-
-
 class MySQLVMDependenciesModel(BaseModel):
     """MySQL dependencies model."""
 
@@ -239,10 +235,7 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:  # noqa: C901
         self.charm.unit.status = MaintenanceStatus("recovering unit after upgrade")
 
         try:
-            if self.charm.app.planned_units() > 1:
-                self._recover_multi_unit_cluster()
-            else:
-                self._recover_single_unit_cluster()
+            self.charm.recover_unit_after_restart()
 
             logger.debug("Upgraded unit is healthy. Set upgrade state to `completed`")
             self.set_unit_completed()
@@ -257,29 +250,6 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:  # noqa: C901
                 "upgrade failed. Check logs for rollback instruction"
             )
 
-    def _recover_multi_unit_cluster(self) -> None:
-        logger.debug("Recovering unit")
-        try:
-            for attempt in Retrying(
-                stop=stop_after_attempt(RECOVER_ATTEMPTS), wait=wait_fixed(10)
-            ):
-                with attempt:
-                    self.charm._mysql.hold_if_recovering()
-                    if not self.charm._mysql.is_instance_in_cluster(self.charm.unit_label):
-                        logger.debug(
-                            "Instance not yet back in the cluster."
-                            f" Retry {attempt.retry_state.attempt_number}/{RECOVER_ATTEMPTS}"
-                        )
-                        self.charm._mysql.start_group_replication()
-                        raise Exception
-        except RetryError:
-            raise
-
-    def _recover_single_unit_cluster(self) -> None:
-        """Recover single unit cluster."""
-        logger.debug("Recovering single unit cluster")
-        self.charm._mysql.reboot_from_complete_outage()
-
     def _on_upgrade_changed(self, _) -> None:
         """Handle the upgrade changed event.
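With the two private recovery helpers removed, the upgrade handler only has to call the charm's shared method and treat a `RetryError` as a failed upgrade. A self-contained sketch of that mapping, with `FakeCharm` standing in for the real charm object:

```python
from tenacity import RetryError


class FakeCharm:
    """Stand-in for the operator charm; simulates a unit that never rejoins."""

    def recover_unit_after_restart(self) -> None:
        raise RetryError(last_attempt=None)


def on_upgrade_granted(charm: FakeCharm) -> str:
    """Mirrors the simplified control flow of _on_upgrade_granted."""
    try:
        charm.recover_unit_after_restart()
        return "completed"  # self.set_unit_completed() in the real handler
    except RetryError:
        return "failed"  # BlockedStatus("upgrade failed ...") in the real handler


assert on_upgrade_granted(FakeCharm()) == "failed"
```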

tests/integration/high_availability/test_upgrade_rollback_incompat.py

Lines changed: 29 additions & 19 deletions

@@ -7,7 +7,6 @@
 import os
 import pathlib
 import shutil
-import subprocess
 from time import sleep
 from zipfile import ZipFile
 
@@ -53,7 +52,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
     with snap_revisions.open("w") as file:
         json.dump(old_revisions, file)
 
-    async with ops_test.fast_forward("10s"):
+    async with ops_test.fast_forward("30s"):
         await ops_test.model.deploy(
             charm,
             application_name=MYSQL_APP_NAME,
@@ -68,6 +67,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
             channel="latest/edge",
             num_units=1,
+            config={"auto_start_writes": False, "sleep_interval": "500"},
         )
 
         await relate_mysql_and_application(ops_test, MYSQL_APP_NAME, TEST_APP)
@@ -106,15 +106,14 @@ async def test_upgrade_to_failling(
     await ensure_all_units_continuous_writes_incrementing(ops_test)
 
     application = ops_test.model.applications[MYSQL_APP_NAME]
-    logger.info("Build charm locally")
 
-    sub_regex_failing_rejoin = (
-        's/logger.debug("Recovering unit")'
-        "/self.charm._mysql.set_instance_offline_mode(True); raise RetryError/"
-    )
-    src_patch(sub_regex=sub_regex_failing_rejoin, file_name="src/upgrade.py")
-    new_charm = await charm_local_build(ops_test, refresh=True)
-    src_patch(revert=True)
+    with InjectFailure(
+        path="src/upgrade.py",
+        original_str="self.charm.recover_unit_after_restart()",
+        replace_str="raise Exception",
+    ):
+        logger.info("Build charm with failure injected")
+        new_charm = await charm_local_build(ops_test, refresh=True)
 
     logger.info("Refresh the charm")
     await application.refresh(path=new_charm)
@@ -187,15 +186,26 @@ async def test_rollback(ops_test, continuous_writes) -> None:
     await ensure_all_units_continuous_writes_incrementing(ops_test)
 
 
-def src_patch(sub_regex: str = "", file_name: str = "", revert: bool = False) -> None:
-    """Apply a patch to the source code."""
-    if revert:
-        cmd = "git checkout src/"  # revert changes on src/ dir
-        logger.info("Reverting patch on source")
-    else:
-        cmd = f"sed -i -e '{sub_regex}' {file_name}"
-        logger.info("Applying patch to source")
-    subprocess.run([cmd], shell=True, check=True)
+class InjectFailure(object):
+    def __init__(self, path: str, original_str: str, replace_str: str):
+        self.path = path
+        self.original_str = original_str
+        self.replace_str = replace_str
+        with open(path, "r") as file:
+            self.original_content = file.read()
+
+    def __enter__(self):
+        logger.info("Injecting failure")
+        assert self.original_str in self.original_content, "replace content not found"
+        new_content = self.original_content.replace(self.original_str, self.replace_str)
+        assert self.original_str not in new_content, "original string not replaced"
+        with open(self.path, "w") as file:
+            file.write(new_content)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        logger.info("Reverting failure")
+        with open(self.path, "w") as file:
+            file.write(self.original_content)
 
 
 async def charm_local_build(ops_test: OpsTest, refresh: bool = False):
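`InjectFailure` is an ordinary context manager, so the revert in `__exit__` runs even when the body raises, which the old `sed` + `git checkout` pair could not guarantee. A usage sketch (assuming the class and the test module's `logger` above, and a throwaway `demo.py` file created just for this illustration):

```python
from pathlib import Path

Path("demo.py").write_text("value = 1\n")

with InjectFailure(path="demo.py", original_str="value = 1", replace_str="value = 2"):
    # Inside the block the file carries the injected change ...
    assert "value = 2" in Path("demo.py").read_text()

# ... and __exit__ restores the original content, even if the body had raised.
assert "value = 1" in Path("demo.py").read_text()
```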

tests/unit/test_mysql.py

Lines changed: 1 addition & 0 deletions

@@ -842,6 +842,7 @@ def test_is_instance_in_cluster(self, _run_mysqlsh_script, _cluster_metadata_exi
             user="serverconfig",
             password="serverconfigpassword",
             host="127.0.0.1:33062",
+            exception_as_warning=True,
         )
 
         _run_mysqlsh_script.return_value = "NOT_A_MEMBER"
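The unit test only has to check that the new keyword argument is forwarded; `assert_called_once_with` compares positional and keyword arguments exactly. A minimal illustration of that assertion style (generic names, not the charm's):

```python
from unittest.mock import MagicMock

run_script = MagicMock()
run_script("SELECT 1", user="serverconfig", exception_as_warning=True)

# Passes only if both the positional and keyword arguments match exactly.
run_script.assert_called_once_with(
    "SELECT 1", user="serverconfig", exception_as_warning=True
)
```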

tests/unit/test_upgrade.py

Lines changed: 4 additions & 3 deletions

@@ -13,6 +13,7 @@
     MySQLStopMySQLDError,
 )
 from ops.testing import Harness
+from tenacity import RetryError
 
 from charm import MySQLOperatorCharm
 
@@ -131,13 +132,13 @@ def test_pre_upgrade_prepare(
         mock_get_primary_label.assert_called_once()
         assert mock_set_dynamic_variable.call_count == 2
 
+    @patch("charm.MySQLOperatorCharm.recover_unit_after_restart")
    @patch("mysql_vm_helpers.MySQL.install_plugins")
    @patch("upgrade.set_cron_daemon")
    @patch("upgrade.MySQLVMUpgrade._check_server_unsupported_downgrade")
    @patch("upgrade.MySQLVMUpgrade._reset_on_unsupported_downgrade")
    @patch("mysql_vm_helpers.MySQL.hold_if_recovering")
    @patch("pathlib.Path.exists", return_value=True)
-    @patch("upgrade.RECOVER_ATTEMPTS", 1)
    @patch("mysql_vm_helpers.MySQL.get_mysql_version", return_value="8.0.33")
    @patch("charm.MySQLOperatorCharm.install_workload", return_value=True)
    @patch("charm.MySQLOperatorCharm.unit_fqdn", return_value="10.0.1.1")
@@ -162,6 +163,7 @@ def test_upgrade_granted(
         mock_check_server_unsupported_downgrade,
         mock_set_cron_daemon,
         mock_install_plugins,
+        mock_recover_unit_after_restart,
    ):
        """Test upgrade-granted hook."""
        self.charm.on.config_changed.emit()
@@ -173,7 +175,6 @@ def test_upgrade_granted(
            self.harness.get_relation_data(self.upgrade_relation_id, "mysql/1")["state"],
            "idle",  # change to `completed` - behavior not yet set in the lib
        )
-        mock_is_instance_in_cluster.assert_called_once()
        mock_check_server_upgradeability.assert_called_once()
        mock_start_mysqld.assert_called_once()
        mock_stop_mysqld.assert_called_once()
@@ -185,7 +186,7 @@ def test_upgrade_granted(
            self.upgrade_relation_id, "mysql/0", {"state": "upgrading"}
        )
        # setup for exception
-        mock_is_instance_in_cluster.return_value = False
+        mock_recover_unit_after_restart.side_effect = RetryError
 
        self.charm.upgrade._on_upgrade_granted(None)
        self.assertEqual(
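Assigning an exception to a mock's `side_effect` makes the next call raise instead of return, which is how the test now drives the failure branch. A tiny standalone illustration, using a plain `ValueError` because tenacity's `RetryError` needs a `last_attempt` argument to construct:

```python
from unittest.mock import MagicMock

recover = MagicMock()
recover.side_effect = ValueError("recovery failed")  # calls will now raise

try:
    recover()
except ValueError as error:
    print(f"caught: {error}")  # caught: recovery failed

recover.side_effect = None
recover.return_value = "ok"
print(recover())  # back to normal behaviour -> ok
```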
