Skip to content

Commit e5b0f94

Browse files
authored
DPE-3547 mitigations for container restart (#377)
* optimizations for container churn mitigation * set floor for max_connections in 100 * function retries * flush logs in single call * + test coverage * fix floor value * switch order * fix typo on constant * using peek to get secret content
1 parent 2194db9 commit e5b0f94

File tree

11 files changed

+167
-80
lines changed

11 files changed

+167
-80
lines changed

lib/charms/mysql/v0/mysql.py

Lines changed: 53 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def wait_until_mysql_connection(self) -> None:
111111

112112
# Increment this PATCH version before using `charmcraft publish-lib` or reset
113113
# to 0 if you are raising the major API version
114-
LIBPATCH = 54
114+
LIBPATCH = 55
115115

116116
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
117117
UNIT_ADD_LOCKNAME = "unit-add"
@@ -122,6 +122,7 @@ def wait_until_mysql_connection(self) -> None:
122122
BYTES_1MiB = 1048576 # 1 mebibyte
123123
RECOVERY_CHECK_TIME = 10 # seconds
124124
GET_MEMBER_STATE_TIME = 10 # seconds
125+
MIN_MAX_CONNECTIONS = 100
125126

126127
SECRET_INTERNAL_LABEL = "secret-id"
127128
SECRET_DELETED_LABEL = "None"
@@ -710,7 +711,7 @@ def render_mysqld_configuration(
710711
innodb_buffer_pool_size = 20 * BYTES_1MiB
711712
innodb_buffer_pool_chunk_size = 1 * BYTES_1MiB
712713
group_replication_message_cache_size = 128 * BYTES_1MiB
713-
max_connections = 20
714+
max_connections = MIN_MAX_CONNECTIONS
714715
performance_schema_instrument = "'memory/%=OFF'"
715716
else:
716717
available_memory = self.get_available_memory()
@@ -723,7 +724,7 @@ def render_mysqld_configuration(
723724
innodb_buffer_pool_chunk_size,
724725
group_replication_message_cache_size,
725726
) = self.get_innodb_buffer_pool_parameters(available_memory)
726-
max_connections = self.get_max_connections(available_memory)
727+
max_connections = max(self.get_max_connections(available_memory), MIN_MAX_CONNECTIONS)
727728
if available_memory < 2 * BYTES_1GiB:
728729
# disable memory instruments if we have less than 2GiB of RAM
729730
performance_schema_instrument = "'memory/%=OFF'"
@@ -1209,7 +1210,11 @@ def initialize_juju_units_operations_table(self) -> None:
12091210
raise MySQLInitializeJujuOperationsTableError(e.message)
12101211

12111212
def add_instance_to_cluster(
1212-
self, instance_address: str, instance_unit_label: str, from_instance: Optional[str] = None
1213+
self,
1214+
instance_address: str,
1215+
instance_unit_label: str,
1216+
from_instance: Optional[str] = None,
1217+
method: str = "auto",
12131218
) -> None:
12141219
"""Add an instance to the InnoDB cluster.
12151220
@@ -1223,6 +1228,7 @@ def add_instance_to_cluster(
12231228
instance_address: address of the instance to add to the cluster
12241229
instance_unit_label: the label/name of the unit
12251230
from_instance: address of the adding instance, e.g. primary
1231+
method: recovery method to use, either "auto" or "clone"
12261232
"""
12271233
options = {
12281234
"password": self.cluster_admin_password,
@@ -1243,39 +1249,37 @@ def add_instance_to_cluster(
12431249
"shell.options['dba.restartWaitTimeout'] = 3600",
12441250
)
12451251

1246-
for recovery_method in ["auto", "clone"]:
1247-
# Prefer "auto" recovery method, but if it fails, try "clone"
1248-
try:
1249-
options["recoveryMethod"] = recovery_method
1250-
add_instance_command = (
1251-
f"cluster.add_instance('{self.cluster_admin_user}@{instance_address}', {json.dumps(options)})",
1252-
)
1252+
# Prefer "auto" recovery method, but if it fails, try "clone"
1253+
try:
1254+
options["recoveryMethod"] = method
1255+
add_instance_command = (
1256+
f"cluster.add_instance('{self.cluster_admin_user}@{instance_address}', {options})",
1257+
)
12531258

1254-
logger.debug(
1255-
f"Adding instance {instance_address}/{instance_unit_label} to cluster {self.cluster_name} with recovery method {recovery_method}"
1256-
)
1257-
self._run_mysqlsh_script("\n".join(connect_commands + add_instance_command))
1259+
logger.info(
1260+
f"Adding instance {instance_address}/{instance_unit_label} to {self.cluster_name=}"
1261+
f"with recovery {method=}"
1262+
)
1263+
self._run_mysqlsh_script("\n".join(connect_commands + add_instance_command))
12581264

1259-
break
1260-
except MySQLClientError as e:
1261-
if recovery_method == "clone":
1262-
logger.exception(
1263-
f"Failed to add instance {instance_address} to cluster {self.cluster_name} on {self.instance_address}",
1264-
exc_info=e,
1265-
)
1266-
self._release_lock(
1267-
from_instance or self.instance_address,
1268-
instance_unit_label,
1269-
UNIT_ADD_LOCKNAME,
1270-
)
1271-
raise MySQLAddInstanceToClusterError(e.message)
1272-
1273-
logger.debug(
1274-
f"Failed to add instance {instance_address} to cluster {self.cluster_name} with recovery method 'auto'. Trying method 'clone'"
1265+
except MySQLClientError:
1266+
if method == "clone":
1267+
logger.exception(
1268+
f"Failed to add {instance_address=} to {self.cluster_name=} on {self.instance_address=}",
12751269
)
1276-
self._release_lock(
1277-
from_instance or self.instance_address, instance_unit_label, UNIT_ADD_LOCKNAME
1278-
)
1270+
raise MySQLAddInstanceToClusterError
1271+
1272+
logger.debug(
1273+
f"Cannot add {instance_address=} to {self.cluster_name=} with recovery {method=}. Trying method 'clone'"
1274+
)
1275+
self.add_instance_to_cluster(
1276+
instance_address, instance_unit_label, from_instance, method="clone"
1277+
)
1278+
finally:
1279+
# always release the lock
1280+
self._release_lock(
1281+
from_instance or self.instance_address, instance_unit_label, UNIT_ADD_LOCKNAME
1282+
)
12791283

12801284
def is_instance_configured_for_innodb(
12811285
self, instance_address: str, instance_unit_label: str
@@ -1398,6 +1402,11 @@ def is_instance_in_cluster(self, unit_label: str) -> bool:
13981402
)
13991403
return False
14001404

1405+
@retry(
1406+
wait=wait_fixed(2),
1407+
stop=stop_after_attempt(3),
1408+
retry=retry_if_exception_type(TimeoutError),
1409+
)
14011410
def get_cluster_status(self, extended: Optional[bool] = False) -> Optional[dict]:
14021411
"""Get the cluster status.
14031412
@@ -2505,13 +2514,19 @@ def get_pid_of_port_3306(self) -> Optional[str]:
25052514
except MySQLExecError:
25062515
return None
25072516

2508-
def flush_mysql_logs(self, logs_type: MySQLTextLogs) -> None:
2517+
def flush_mysql_logs(self, logs_type: Union[MySQLTextLogs, list[MySQLTextLogs]]) -> None:
25092518
"""Flushes the specified logs_type logs."""
2510-
flush_logs_commands = (
2519+
flush_logs_commands = [
25112520
f"shell.connect('{self.server_config_user}:{self.server_config_password}@{self.instance_address}')",
25122521
'session.run_sql("SET sql_log_bin = 0")',
2513-
f'session.run_sql("FLUSH {logs_type.value}")',
2514-
)
2522+
]
2523+
2524+
if type(logs_type) is list:
2525+
flush_logs_commands.extend(
2526+
[f"session.run_sql('FLUSH {log.value}')" for log in logs_type]
2527+
)
2528+
else:
2529+
flush_logs_commands.append(f'session.run_sql("FLUSH {logs_type.value}")') # type: ignore
25152530

25162531
try:
25172532
self._run_mysqlsh_script("\n".join(flush_logs_commands))

poetry.lock

Lines changed: 7 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ authors = []
1010

1111
[tool.poetry.dependencies]
1212
python = "^3.10"
13-
ops = "^2.5.0"
13+
ops = "^2.7.0"
1414
lightkube = "^0.14.0"
1515
tenacity = "^8.2.2"
1616
boto3 = "^1.28.22"

src/charm.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,7 @@ def _open_ports(self) -> None:
481481
"""
482482
if ops.JujuVersion.from_environ().supports_open_port_on_k8s:
483483
try:
484-
self.unit.open_port("tcp", 3306)
485-
self.unit.open_port("tcp", 33060)
484+
self.unit.set_ports(3306, 33060)
486485
except ops.ModelError:
487486
logger.exception("failed to open port")
488487

src/k8s_helpers.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55

66
import logging
77
import socket
8+
import typing
89
from typing import Dict, List, Optional, Tuple
910

10-
from lightkube import Client
11+
from lightkube.core.client import Client
1112
from lightkube.core.exceptions import ApiError
1213
from lightkube.models.core_v1 import ServicePort, ServiceSpec
1314
from lightkube.models.meta_v1 import ObjectMeta
1415
from lightkube.resources.apps_v1 import StatefulSet
1516
from lightkube.resources.core_v1 import Node, Pod, Service
16-
from ops.charm import CharmBase
1717
from tenacity import retry, stop_after_attempt, wait_fixed
1818

1919
from utils import any_memory_to_bytes
@@ -24,6 +24,9 @@
2424
logging.getLogger("httpcore").setLevel(logging.ERROR)
2525
logging.getLogger("httpx").setLevel(logging.ERROR)
2626

27+
if typing.TYPE_CHECKING:
28+
from charm import MySQLOperatorCharm
29+
2730

2831
class KubernetesClientError(Exception):
2932
"""Exception raised when client can't execute."""
@@ -32,7 +35,7 @@ class KubernetesClientError(Exception):
3235
class KubernetesHelpers:
3336
"""Kubernetes helpers for service exposure."""
3437

35-
def __init__(self, charm: CharmBase):
38+
def __init__(self, charm: "MySQLOperatorCharm"):
3639
"""Initialize Kubernetes helpers.
3740
3841
Args:
@@ -42,7 +45,7 @@ def __init__(self, charm: CharmBase):
4245
self.namespace = charm.model.name
4346
self.app_name = charm.model.app.name
4447
self.cluster_name = charm.app_peer_data.get("cluster-name")
45-
self.client = Client()
48+
self.client = Client() # type: ignore
4649

4750
def create_endpoint_services(self, roles: List[str]) -> None:
4851
"""Create kubernetes service for endpoints.

src/mysql_k8s_helpers.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
MySQLStopMySQLDError,
2121
)
2222
from ops.model import Container
23-
from ops.pebble import ChangeError, ExecError
23+
from ops.pebble import ChangeError, ExecError, PathError
2424
from tenacity import (
2525
retry,
2626
retry_if_exception_type,
@@ -195,14 +195,13 @@ def fix_data_dir(self, container: Container) -> None:
195195
logger.debug(f"Changing ownership to {MYSQL_SYSTEM_USER}:{MYSQL_SYSTEM_GROUP}")
196196
try:
197197
container.exec(
198-
f"chown -R {MYSQL_SYSTEM_USER}:{MYSQL_SYSTEM_GROUP} {MYSQL_DATA_DIR}".split(
199-
" "
200-
)
198+
["chown", "-R", f"{MYSQL_SYSTEM_USER}:{MYSQL_SYSTEM_GROUP}", MYSQL_DATA_DIR]
201199
)
202200
except ExecError as e:
203201
logger.error(f"Exited with code {e.exit_code}. Stderr:\n{e.stderr}")
204202
raise MySQLInitialiseMySQLDError(e.stderr or "")
205203

204+
@retry(reraise=True, stop=stop_after_delay(30), wait=wait_fixed(5))
206205
def initialise_mysqld(self) -> None:
207206
"""Execute instance first run.
208207
@@ -217,13 +216,11 @@ def initialise_mysqld(self) -> None:
217216
user=MYSQL_SYSTEM_USER,
218217
group=MYSQL_SYSTEM_GROUP,
219218
)
220-
process.wait_output()
221-
except ExecError as e:
222-
logger.error("Exited with code %d. Stderr:", e.exit_code)
223-
if e.stderr:
224-
for line in e.stderr.splitlines():
225-
logger.error(" %s", line)
226-
raise MySQLInitialiseMySQLDError(e.stderr if e.stderr else "")
219+
process.wait()
220+
except (ExecError, ChangeError, PathError, TimeoutError):
221+
logger.exception("Failed to initialise MySQL data directory")
222+
self.reset_data_dir()
223+
raise MySQLInitialiseMySQLDError
227224

228225
@retry(reraise=True, stop=stop_after_delay(30), wait=wait_fixed(5))
229226
def wait_until_mysql_connection(self, check_port: bool = True) -> None:
@@ -756,6 +753,14 @@ def remove_file(self, path: str) -> None:
756753
if self.container.exists(path):
757754
self.container.remove_path(path)
758755

756+
def reset_data_dir(self) -> None:
757+
"""Remove all files from the data directory."""
758+
content = self.container.list_files(MYSQL_DATA_DIR)
759+
content_set = {item.name for item in content}
760+
logger.debug("Resetting MySQL data directory.")
761+
for item in content_set:
762+
self.container.remove_path(f"{MYSQL_DATA_DIR}/{item}", recursive=True)
763+
759764
def check_if_mysqld_process_stopped(self) -> bool:
760765
"""Checks if the mysqld process is stopped on the container."""
761766
command = ["ps", "-eo", "comm,stat"]

src/rotate_mysql_logs.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import logging
77
import typing
88

9-
from charms.mysql.v0.mysql import MySQLExecError, MySQLTextLogs
9+
from charms.mysql.v0.mysql import MySQLClientError, MySQLExecError, MySQLTextLogs
1010
from ops.charm import CharmEvents
1111
from ops.framework import EventBase, EventSource, Object
1212

@@ -48,10 +48,8 @@ def _rotate_mysql_logs(self, _) -> None:
4848

4949
try:
5050
self.charm._mysql._execute_commands(["logrotate", "-f", LOG_ROTATE_CONFIG_FILE])
51+
self.charm._mysql.flush_mysql_logs(list(MySQLTextLogs))
5152
except MySQLExecError:
52-
logger.exception("Failed to rotate mysql logs")
53-
return
54-
55-
self.charm._mysql.flush_mysql_logs(MySQLTextLogs.ERROR)
56-
self.charm._mysql.flush_mysql_logs(MySQLTextLogs.GENERAL)
57-
self.charm._mysql.flush_mysql_logs(MySQLTextLogs.SLOW)
53+
logger.warning("Failed to rotate MySQL logs")
54+
except MySQLClientError:
55+
logger.warning("Failed to flush MySQL logs")

tests/integration/test_charm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ async def test_custom_variables(ops_test: OpsTest) -> None:
325325
application = ops_test.model.applications[APP_NAME]
326326

327327
custom_vars = {}
328-
custom_vars["max_connections"] = 20
328+
custom_vars["max_connections"] = 100
329329
custom_vars["innodb_buffer_pool_size"] = 20971520
330330
custom_vars["innodb_buffer_pool_chunk_size"] = 1048576
331331
custom_vars["group_replication_message_cache_size"] = 134217728

0 commit comments

Comments
 (0)