
Commit 66bfe57

fix: DPE-6485 Manage mysqld directly (#575)
* Manage mysqld without mysqld_safe
* set behavior in child
* refactor to ditch rolling ops event defer
* method moved to charm.py
* remove `charm`
* speed up test
* keep try/catch to keep RetryError import
* enabling SQL `RESTART`
* fix bump version
* 10s fast_forward prevents all units being idle at the same time and timing out the test
* pythonic failure injection method: tests consistently fail on CI but work on my machine(tm), so this eliminates any differences from the env
1 parent be3cbce commit 66bfe57

13 files changed (+146, -116 lines)
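The headline change is that the charm now runs mysqld directly as the Pebble service instead of wrapping it in mysqld_safe. The "enabling SQL `RESTART`" item in the commit message relies on mysqld being told it has a supervising parent: the new Pebble layer in src/charm.py (below) sets MYSQLD_PARENT_PID in the service environment, while Pebble's service auto-restart (tuned by the backoff overrides at the end of this diff) brings the server back up. A minimal, illustrative way to exercise SQL RESTART from a client, assuming mysql-connector-python and placeholder credentials (not part of this commit):

    # Illustration only: host, user and password are placeholders.
    import mysql.connector

    conn = mysql.connector.connect(host="127.0.0.1", user="serverconfig", password="<password>")
    cursor = conn.cursor()
    # RESTART only succeeds when mysqld expects a supervisor to restart it;
    # the connection is dropped while the server goes down and comes back.
    cursor.execute("RESTART")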

lib/charms/mysql/v0/mysql.py

Lines changed: 7 additions & 2 deletions
@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
 # Increment this major API version when introducing breaking changes
 LIBAPI = 0

-LIBPATCH = 82
+LIBPATCH = 83

 UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
 UNIT_ADD_LOCKNAME = "unit-add"
@@ -1463,7 +1463,11 @@ def get_variable_value(self, variable: str) -> str:
         return rows[0][1]

     def configure_instance(self, create_cluster_admin: bool = True) -> None:
-        """Configure the instance to be used in an InnoDB cluster."""
+        """Configure the instance to be used in an InnoDB cluster.
+
+        Args:
+            create_cluster_admin: Whether to create the cluster admin user.
+        """
         options = {
             "restart": "true",
         }
@@ -1955,6 +1959,7 @@ def is_instance_in_cluster(self, unit_label: str) -> bool:
             user=self.server_config_user,
             password=self.server_config_password,
             host=self.instance_def(self.server_config_user),
+            exception_as_warning=True,
         )
         return (
             MySQLMemberState.ONLINE in output.lower()
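The `exception_as_warning=True` argument added above matters for the recovery loop introduced in src/charm.py: while a unit is rejoining, `is_instance_in_cluster` is polled repeatedly and is expected to fail for a while, so those failures should not land in the log as errors. A rough sketch of the pattern, assuming the flag simply downgrades the log level; this is not the library's exact implementation:

    import logging

    logger = logging.getLogger(__name__)

    def run_script(execute, script: str, exception_as_warning: bool = False):
        # `execute` is any callable that runs the script and may raise.
        try:
            return execute(script)
        except Exception:
            log = logger.warning if exception_as_warning else logger.exception
            log("Failed to execute script")
            raise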

src/charm.py

Lines changed: 66 additions & 31 deletions
@@ -61,6 +61,7 @@
     WaitingStatus,
 )
 from ops.pebble import Layer
+from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

 from config import CharmConfig, MySQLConfig
 from constants import (
@@ -79,7 +80,8 @@
     MYSQLD_CONFIG_FILE,
     MYSQLD_EXPORTER_PORT,
     MYSQLD_EXPORTER_SERVICE,
-    MYSQLD_SAFE_SERVICE,
+    MYSQLD_LOCATION,
+    MYSQLD_SERVICE,
     PASSWORD_LENGTH,
     PEER,
     ROOT_PASSWORD_KEY,
@@ -225,18 +227,30 @@ def _mysql(self) -> MySQL:
     @property
     def _pebble_layer(self) -> Layer:
         """Return a layer for the mysqld pebble service."""
+        mysqld_cmd = [
+            MYSQLD_LOCATION,
+            "--basedir=/usr",
+            "--datadir=/var/lib/mysql",
+            "--plugin-dir=/usr/lib/mysql/plugin",
+            "--log-error=/var/log/mysql/error.log",
+            f"--pid-file={self.unit_label}.pid",
+        ]
+
         layer = {
             "summary": "mysqld services layer",
             "description": "pebble config layer for mysqld safe and exporter",
             "services": {
-                MYSQLD_SAFE_SERVICE: {
+                MYSQLD_SERVICE: {
                     "override": "replace",
                     "summary": "mysqld safe",
-                    "command": MYSQLD_SAFE_SERVICE,
+                    "command": " ".join(mysqld_cmd),
                     "startup": "enabled",
                     "user": MYSQL_SYSTEM_USER,
                     "group": MYSQL_SYSTEM_GROUP,
                     "kill-delay": "24h",
+                    "environment": {
+                        "MYSQLD_PARENT_PID": 1,
+                    },
                 },
                 MYSQLD_EXPORTER_SERVICE: {
                     "override": "replace",
@@ -451,7 +465,7 @@ def _reconcile_pebble_layer(self, container: Container) -> None:
         if new_layer.services != current_layer.services:
             logger.info("Reconciling the pebble layer")

-            container.add_layer(MYSQLD_SAFE_SERVICE, new_layer, combine=True)
+            container.add_layer(MYSQLD_SERVICE, new_layer, combine=True)
             container.replan()
             self._mysql.wait_until_mysql_connection()

@@ -463,34 +477,55 @@ def _reconcile_pebble_layer(self, container: Container) -> None:
         ):
             container.stop(MYSQLD_EXPORTER_SERVICE)

-    def _restart(self, event: EventBase) -> None:
+    def recover_unit_after_restart(self) -> None:
+        """Wait for unit recovery/rejoin after restart."""
+        recovery_attempts = 30
+        logger.info("Recovering unit")
+        if self.app.planned_units() == 1:
+            self._mysql.reboot_from_complete_outage()
+        else:
+            try:
+                for attempt in Retrying(
+                    stop=stop_after_attempt(recovery_attempts), wait=wait_fixed(15)
+                ):
+                    with attempt:
+                        self._mysql.hold_if_recovering()
+                        if not self._mysql.is_instance_in_cluster(self.unit_label):
+                            logger.debug(
+                                "Instance not yet back in the cluster."
+                                f" Retry {attempt.retry_state.attempt_number}/{recovery_attempts}"
+                            )
+                            raise Exception
+            except RetryError:
+                raise
+
+    def _restart(self, _: EventBase) -> None:
         """Restart the service."""
-        if self.peers.units != self.restart_peers.units:
-            # defer restart until all units are in the relation
-            logger.debug("Deferring restart until all units are in the relation")
-            event.defer()
+        container = self.unit.get_container(CONTAINER_NAME)
+        if not container.can_connect():
             return
-        if self.peers.units and self._mysql.is_unit_primary(self.unit_label):
-            # delay primary on multi units
-            restart_states = {
-                self.restart_peers.data[unit].get("state", "unset") for unit in self.peers.units
-            }
-            if restart_states == {"unset"}:
-                logger.info("Restarting primary")
-            elif restart_states != {"release"}:
-                # Wait other units restart first to minimize primary switchover
-                message = "Primary restart deferred after other units"
-                logger.info(message)
-                self.unit.status = WaitingStatus(message)
-                event.defer()
-                return
+
+        if not self.unit_initialized:
+            logger.debug("Restarting standalone mysqld")
+            container.restart(MYSQLD_SERVICE)
+            return
+
+        if self.app.planned_units() > 1 and self._mysql.is_unit_primary(self.unit_label):
+            try:
+                new_primary = self.get_unit_address(self.peers.units.pop())
+                logger.debug(f"Switching primary to {new_primary}")
+                self._mysql.set_cluster_primary(new_primary)
+            except MySQLSetClusterPrimaryError:
+                logger.warning("Changing primary failed")
+
+        logger.debug("Restarting mysqld")
         self.unit.status = MaintenanceStatus("restarting MySQL")
-        container = self.unit.get_container(CONTAINER_NAME)
-        if container.can_connect():
-            logger.debug("Restarting mysqld")
-            container.pebble.restart_services([MYSQLD_SAFE_SERVICE], timeout=3600)
-            sleep(10)
-            self._on_update_status(None)
+        container.pebble.restart_services([MYSQLD_SERVICE], timeout=3600)
+        self.unit.status = MaintenanceStatus("recovering unit after restart")
+        sleep(10)
+        self.recover_unit_after_restart()
+
+        self._on_update_status(None)

     # =========================================================================
     # Charm event handlers
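The recovery loop above gives a restarted unit up to 30 attempts spaced 15 seconds apart, roughly 7.5 minutes, to report back in the cluster before RetryError propagates and the hook fails. The same tenacity pattern in isolation, with a hypothetical is_ready predicate standing in for is_instance_in_cluster:

    from tenacity import Retrying, stop_after_attempt, wait_fixed

    def wait_until(is_ready, attempts: int = 30, interval: int = 15) -> None:
        # Raises tenacity.RetryError after ~attempts * interval seconds of failures.
        for attempt in Retrying(stop=stop_after_attempt(attempts), wait=wait_fixed(interval)):
            with attempt:
                if not is_ready():
                    raise Exception("not ready yet")

    # wait_until(lambda: mysql.is_instance_in_cluster("mysql-k8s-0"))  # hypothetical usage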
@@ -647,8 +682,8 @@ def _configure_instance(self, container) -> None:

         # Add the pebble layer
         logger.info("Adding pebble layer")
-        container.add_layer(MYSQLD_SAFE_SERVICE, self._pebble_layer, combine=True)
-        container.restart(MYSQLD_SAFE_SERVICE)
+        container.add_layer(MYSQLD_SERVICE, self._pebble_layer, combine=True)
+        container.restart(MYSQLD_SERVICE)

         logger.info("Waiting for instance to be ready")
         self._mysql.wait_until_mysql_connection(check_port=False)

src/constants.py

Lines changed: 2 additions & 2 deletions
@@ -6,8 +6,8 @@
 PASSWORD_LENGTH = 24
 PEER = "database-peers"
 CONTAINER_NAME = "mysql"
-MYSQLD_LOCATION = "mysqld"
-MYSQLD_SAFE_SERVICE = "mysqld_safe"
+MYSQLD_SERVICE = "mysqld"
+MYSQLD_LOCATION = f"/usr/sbin/{MYSQLD_SERVICE}"
 ROOT_USERNAME = "root"
 CLUSTER_ADMIN_USERNAME = "clusteradmin"
 SERVER_CONFIG_USERNAME = "serverconfig"

src/mysql_k8s_helpers.py

Lines changed: 6 additions & 6 deletions
@@ -42,7 +42,7 @@
     MYSQL_SYSTEM_USER,
     MYSQLD_DEFAULTS_CONFIG_FILE,
     MYSQLD_LOCATION,
-    MYSQLD_SAFE_SERVICE,
+    MYSQLD_SERVICE,
     MYSQLD_SOCK_FILE,
     MYSQLSH_LOCATION,
     ROOT_SYSTEM_USER,
@@ -561,22 +561,22 @@ def stop_mysqld(self) -> None:
         """Stops the mysqld process."""
         try:
             # call low-level pebble API to access timeout parameter
-            self.container.pebble.stop_services([MYSQLD_SAFE_SERVICE], timeout=5 * 60)
+            self.container.pebble.stop_services([MYSQLD_SERVICE], timeout=5 * 60)
         except ChangeError:
-            error_message = f"Failed to stop service {MYSQLD_SAFE_SERVICE}"
+            error_message = f"Failed to stop service {MYSQLD_SERVICE}"
             logger.exception(error_message)
             raise MySQLStopMySQLDError(error_message)

     def start_mysqld(self) -> None:
         """Starts the mysqld process."""
         try:
-            self.container.start(MYSQLD_SAFE_SERVICE)
+            self.container.start(MYSQLD_SERVICE)
             self.wait_until_mysql_connection()
         except (
             ChangeError,
             MySQLServiceNotRunningError,
         ):
-            error_message = f"Failed to start service {MYSQLD_SAFE_SERVICE}"
+            error_message = f"Failed to start service {MYSQLD_SERVICE}"
             logger.exception(error_message)
             raise MySQLStartMySQLDError(error_message)

@@ -792,7 +792,7 @@ def check_if_mysqld_process_stopped(self) -> bool:
         for line in stdout.strip().split("\n"):
             [comm, stat] = line.split()

-            if comm == MYSQLD_SAFE_SERVICE:
+            if comm == MYSQLD_SERVICE:
                 return "T" in stat

         return True
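With the service renamed, check_if_mysqld_process_stopped now matches a process whose command is mysqld rather than mysqld_safe, treating a "T" (stopped) state flag as stopped and an absent process as already stopped. A standalone sketch of that parsing, with ps -o comm,stat style output made up for illustration:

    def mysqld_process_stopped(ps_output: str, service: str = "mysqld") -> bool:
        # Mirrors the check above: "T" in the state column means stopped (SIGSTOP),
        # and no matching process at all also counts as stopped.
        for line in ps_output.strip().split("\n"):
            comm, stat = line.split()
            if comm == service:
                return "T" in stat
        return True

    print(mysqld_process_stopped("mysqld Ssl"))  # False: running
    print(mysqld_process_stopped("mysqld T"))    # True: stopped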

src/upgrade.py

Lines changed: 4 additions & 34 deletions
@@ -28,23 +28,18 @@
 from ops.model import BlockedStatus, MaintenanceStatus, RelationDataContent
 from ops.pebble import ChangeError
 from pydantic import BaseModel
-from tenacity import RetryError, Retrying
-from tenacity.stop import stop_after_attempt
-from tenacity.wait import wait_fixed
+from tenacity import RetryError
 from typing_extensions import override

 import k8s_helpers
-from constants import CONTAINER_NAME, MYSQLD_SAFE_SERVICE
+from constants import CONTAINER_NAME, MYSQLD_SERVICE

 if TYPE_CHECKING:
     from charm import MySQLOperatorCharm

 logger = logging.getLogger(__name__)


-RECOVER_ATTEMPTS = 10
-
-
 class MySQLK8sDependenciesModel(BaseModel):
     """MySQL dependencies model."""

@@ -231,10 +226,7 @@ def _on_pebble_ready(self, event) -> None:
                 self.charm._reconcile_pebble_layer(container)
                 self._check_server_upgradeability()
                 self.charm.unit.status = MaintenanceStatus("recovering unit after upgrade")
-                if self.charm.app.planned_units() > 1:
-                    self._recover_multi_unit_cluster()
-                else:
-                    self._recover_single_unit_cluster()
+                self.charm.recover_unit_after_restart()
                 if self.charm.config.plugin_audit_enabled:
                     self.charm._mysql.install_plugins(["audit_log"])
                 self._complete_upgrade()
@@ -269,28 +261,6 @@ def _on_pebble_ready(self, event) -> None:
             self._reset_on_unsupported_downgrade(container)
             self._complete_upgrade()

-    def _recover_multi_unit_cluster(self) -> None:
-        logger.info("Recovering unit")
-        try:
-            for attempt in Retrying(
-                stop=stop_after_attempt(RECOVER_ATTEMPTS), wait=wait_fixed(10)
-            ):
-                with attempt:
-                    self.charm._mysql.hold_if_recovering()
-                    if not self.charm._mysql.is_instance_in_cluster(self.charm.unit_label):
-                        logger.debug(
-                            "Instance not yet back in the cluster."
-                            f" Retry {attempt.retry_state.attempt_number}/{RECOVER_ATTEMPTS}"
-                        )
-                        raise Exception
-        except RetryError:
-            raise
-
-    def _recover_single_unit_cluster(self) -> None:
-        """Recover single unit cluster."""
-        logger.debug("Recovering single unit cluster")
-        self.charm._mysql.reboot_from_complete_outage()
-
     def _complete_upgrade(self):
         # complete upgrade for the unit
         logger.debug("Upgraded unit is healthy. Set upgrade state to `completed`")
@@ -346,7 +316,7 @@ def _check_server_unsupported_downgrade(self) -> bool:

     def _reset_on_unsupported_downgrade(self, container: Container) -> None:
         """Reset the cluster on unsupported downgrade."""
-        container.stop(MYSQLD_SAFE_SERVICE)
+        container.stop(MYSQLD_SERVICE)
         self.charm._mysql.reset_data_dir()
         self.charm._write_mysqld_configuration()
         self.charm._configure_instance(container)

tests/integration/helpers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from pytest_operator.plugin import OpsTest
2525
from tenacity import RetryError, Retrying, retry, stop_after_attempt, wait_fixed
2626

27-
from constants import CONTAINER_NAME, MYSQLD_SAFE_SERVICE, SERVER_CONFIG_USERNAME
27+
from constants import CONTAINER_NAME, MYSQLD_SERVICE, SERVER_CONFIG_USERNAME
2828

2929
from . import juju_
3030
from .connector import MySQLConnector
@@ -480,7 +480,7 @@ async def stop_mysqld_service(ops_test: OpsTest, unit_name: str) -> None:
480480
unit_name: The name of the unit
481481
"""
482482
await ops_test.juju(
483-
"ssh", "--container", CONTAINER_NAME, unit_name, "pebble", "stop", MYSQLD_SAFE_SERVICE
483+
"ssh", "--container", CONTAINER_NAME, unit_name, "pebble", "stop", MYSQLD_SERVICE
484484
)
485485

486486

@@ -492,7 +492,7 @@ async def start_mysqld_service(ops_test: OpsTest, unit_name: str) -> None:
492492
unit_name: The name of the unit
493493
"""
494494
await ops_test.juju(
495-
"ssh", "--container", CONTAINER_NAME, unit_name, "pebble", "start", MYSQLD_SAFE_SERVICE
495+
"ssh", "--container", CONTAINER_NAME, unit_name, "pebble", "start", MYSQLD_SERVICE
496496
)
497497

498498

tests/integration/high_availability/high_availability_helpers.py

Lines changed: 1 addition & 0 deletions
@@ -207,6 +207,7 @@ async def deploy_and_scale_application(
         num_units=1,
         channel="latest/edge",

+        config={"sleep_interval": 300},
     )

     await ops_test.model.wait_for_idle(
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 services:
-  mysqld_safe:
+  mysqld:
     override: merge
     backoff-delay: 300s
     backoff-limit: 300s
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 services:
-  mysqld_safe:
+  mysqld:
     override: merge
     backoff-delay: 500ms
     backoff-limit: 30s
