Skip to content

Commit 014365a

Browse files
Retry if MySQL Server is unreachable (#190)
Fixes #86 ## Context These are the situations the charm should handle without raising an uncaught exception: - `mysqlrouter --bootstrap` fails because server does not have quorum - `mysqlrouter --bootstrap` fails because server is unreachable (error code [2003](https://dev.mysql.com/doc/mysql-errors/8.0/en/client-error-reference.html#error_cr_conn_host_error)) - mysqlsh `shell.connect()` fails because server is unreachable (error code [2003](https://dev.mysql.com/doc/mysql-errors/8.0/en/client-error-reference.html#error_cr_conn_host_error)) (mysqlsh commands seem to not fail even if server does not have quorum) ## Solution Catch exceptions in the aforementioned situations & set waiting status. Retry connections on the next Juju event ## Alternatives considered We decided not to retry within the Juju event so that - (primary reason) the user is not blocked from removing the unit(s) - if the endpoint from mysql server changes, we need to get a new event to get the updated endpoint A retry within the Juju event with a short timeout was considered. However, during initial deployment ~15 events are currently fired (so a 1 min timeout could block the charm for 15 minutes). It would be possible to mitigate that by storing information about the last retry—but the complexity was not worth it. Discussion about Juju limitation that we may not get another event until update status: https://matrix.to/#/!xdClnUGkurzjxqiQcN:ubuntu.com/$_SXnGJWUQCeFGu9vpBxD5GJ5hgMMAHchOmWBPiqjDFY?via=ubuntu.com&via=matrix.org&via=cutefunny.art ## Implementation/refactoring - Catch mysql shell `DBError` exceptions, serialize them to JSON, and raise corresponding exception in charm code. This was done to move complexity (i.e. exception handling) from mysqlsh python code to charm code - Move mysqlsh python code from strings to Jinja templates (for better syntax highlighting/analysis & code review/diffs) - Use JSON files to transfer data from mysqlsh python code to charm code instead of printing & parsing stdout
1 parent d89b0f8 commit 014365a

15 files changed

+306
-93
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.py.jinja linguist-language=Python

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,9 @@ select = ["E", "W", "F", "C", "N", "R", "D", "H"]
9898
# Ignore D403 First word of the first line should be properly capitalized (false positive on "MySQL")
9999
# Ignore N818 Exception should be named with an Error suffix
100100
# Ignore D102 Missing docstring in public method (pydocstyle doesn't look for docstrings in super class
101+
# Ignore W505 So that strings in comments aren't split across lines
101102
# https://github.com/PyCQA/pydocstyle/issues/309) TODO: add pylint check? https://github.com/PyCQA/pydocstyle/issues/309#issuecomment-1284142716
102-
ignore = ["W503", "E501", "D107", "D105", "D415", "D403", "N818", "D102"]
103+
ignore = ["W503", "E501", "D107", "D105", "D415", "D403", "N818", "D102", "W505"]
103104
# D100, D101, D102, D103: Ignore missing docstrings in tests
104105
per-file-ignores = ["tests/*:D100,D101,D102,D103,D104"]
105106
docstring-convention = "google"

src/abstract_charm.py

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import machine_upgrade
1818
import relations.database_provides
1919
import relations.database_requires
20+
import server_exceptions
2021
import upgrade
2122
import workload
2223

@@ -232,37 +233,43 @@ def reconcile(self, event=None) -> None: # noqa: C901
232233
f"{self._database_requires.is_relation_breaking(event)=}, "
233234
f"{self._upgrade.in_progress=}"
234235
)
235-
if self._unit_lifecycle.authorized_leader:
236-
if self._database_requires.is_relation_breaking(event):
237-
if self._upgrade.in_progress:
238-
logger.warning(
239-
"Modifying relations during an upgrade is not supported. The charm may be in a broken, unrecoverable state. Re-deploy the charm"
236+
try:
237+
if self._unit_lifecycle.authorized_leader:
238+
if self._database_requires.is_relation_breaking(event):
239+
if self._upgrade.in_progress:
240+
logger.warning(
241+
"Modifying relations during an upgrade is not supported. The charm may be in a broken, unrecoverable state. Re-deploy the charm"
242+
)
243+
self._database_provides.delete_all_databags()
244+
elif (
245+
not self._upgrade.in_progress
246+
and isinstance(workload_, workload.AuthenticatedWorkload)
247+
and workload_.container_ready
248+
):
249+
self._database_provides.reconcile_users(
250+
event=event,
251+
router_read_write_endpoint=self._read_write_endpoint,
252+
router_read_only_endpoint=self._read_only_endpoint,
253+
shell=workload_.shell,
240254
)
241-
self._database_provides.delete_all_databags()
242-
elif (
243-
not self._upgrade.in_progress
244-
and isinstance(workload_, workload.AuthenticatedWorkload)
245-
and workload_.container_ready
246-
):
247-
self._database_provides.reconcile_users(
248-
event=event,
249-
router_read_write_endpoint=self._read_write_endpoint,
250-
router_read_only_endpoint=self._read_only_endpoint,
251-
shell=workload_.shell,
252-
)
253-
if isinstance(workload_, workload.AuthenticatedWorkload) and workload_.container_ready:
254-
workload_.enable(tls=self._tls_certificate_saved, unit_name=self.unit.name)
255-
elif workload_.container_ready:
256-
workload_.disable()
257-
# Empty waiting status means we're waiting for database requires relation before starting
258-
# workload
259-
if not workload_.status or workload_.status == ops.WaitingStatus():
260-
self._upgrade.unit_state = "healthy"
261-
if self._unit_lifecycle.authorized_leader:
262-
self._upgrade.reconcile_partition()
263-
if not self._upgrade.in_progress:
264-
self._upgrade.set_versions_in_app_databag()
265-
self.set_status(event=event)
255+
if isinstance(workload_, workload.AuthenticatedWorkload) and workload_.container_ready:
256+
workload_.enable(tls=self._tls_certificate_saved, unit_name=self.unit.name)
257+
elif workload_.container_ready:
258+
workload_.disable()
259+
# Empty waiting status means we're waiting for database requires relation before
260+
# starting workload
261+
if not workload_.status or workload_.status == ops.WaitingStatus():
262+
self._upgrade.unit_state = "healthy"
263+
if self._unit_lifecycle.authorized_leader:
264+
self._upgrade.reconcile_partition()
265+
if not self._upgrade.in_progress:
266+
self._upgrade.set_versions_in_app_databag()
267+
self.set_status(event=event)
268+
except server_exceptions.Error as e:
269+
# If not for `unit=False`, another `server_exceptions.Error` could be thrown here
270+
self.set_status(event=event, unit=False)
271+
self.unit.status = e.status
272+
logger.debug(f"Set unit status to {self.unit.status}")
266273

267274
def _on_resume_upgrade_action(self, event: ops.ActionEvent) -> None:
268275
if not self._unit_lifecycle.authorized_leader:

src/container.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ def relative_to_container(self) -> pathlib.PurePosixPath:
2222
Only differs from `self` on machine charm
2323
"""
2424

25+
@abc.abstractmethod
26+
def open(self, mode="r") -> typing.TextIO:
27+
"""Open the file in read text mode."""
28+
assert mode == "r"
29+
2530
@abc.abstractmethod
2631
def read_text(self) -> str:
2732
"""Open the file in text mode, read it, and close the file."""

src/mysql_shell.py renamed to src/mysql_shell/__init__.py

Lines changed: 96 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,18 @@
99
import dataclasses
1010
import json
1111
import logging
12+
import pathlib
1213
import secrets
1314
import string
1415
import typing
1516

17+
import jinja2
18+
1619
import container
20+
import server_exceptions
21+
22+
if typing.TYPE_CHECKING:
23+
import relations.database_requires
1724

1825
logger = logging.getLogger(__name__)
1926

@@ -29,33 +36,57 @@ class RouterUserInformation:
2936
router_id: str
3037

3138

39+
class ShellDBError(Exception):
40+
"""`mysqlsh.DBError` raised while executing MySQL Shell script
41+
42+
MySQL Shell runs Python code in a separate process from the charm Python code.
43+
The `mysqlsh.DBError` was caught by the shell code, serialized to JSON, and de-serialized to
44+
this exception.
45+
"""
46+
47+
def __init__(self, *, message: str, code: int, traceback_message: str):
48+
super().__init__(message)
49+
self.code = code
50+
self.traceback_message = traceback_message
51+
52+
3253
# TODO python3.10 min version: Add `(kw_only=True)`
3354
@dataclasses.dataclass
3455
class Shell:
3556
"""MySQL Shell connected to MySQL cluster"""
3657

3758
_container: container.Container
38-
username: str
39-
_password: str
40-
_host: str
41-
_port: str
59+
_connection_info: "relations.database_requires.CompleteConnectionInformation"
60+
61+
@property
62+
def username(self):
63+
return self._connection_info.username
4264

4365
# TODO python3.10 min version: Use `list` instead of `typing.List`
44-
def _run_commands(self, commands: typing.List[str]) -> str:
45-
"""Connect to MySQL cluster and run commands."""
66+
def _run_code(self, code: str) -> None:
67+
"""Connect to MySQL cluster and run Python code."""
68+
template = _jinja_env.get_template("try_except_wrapper.py.jinja")
69+
error_file = self._container.path("/tmp/mysqlsh_error.json")
70+
71+
def render(connection_info: "relations.database_requires.ConnectionInformation"):
72+
return template.render(
73+
username=connection_info.username,
74+
password=connection_info.password,
75+
host=connection_info.host,
76+
port=connection_info.port,
77+
code=code,
78+
error_filepath=error_file.relative_to_container,
79+
)
80+
4681
# Redact password from log
47-
logged_commands = commands.copy()
48-
logged_commands.insert(
49-
0, f"shell.connect('{self.username}:***@{self._host}:{self._port}')"
50-
)
82+
logged_script = render(self._connection_info.redacted)
5183

52-
commands.insert(
53-
0, f"shell.connect('{self.username}:{self._password}@{self._host}:{self._port}')"
54-
)
55-
temporary_script_file = self._container.path("/tmp/script.py")
56-
temporary_script_file.write_text("\n".join(commands))
84+
script = render(self._connection_info)
85+
temporary_script_file = self._container.path("/tmp/mysqlsh_script.py")
86+
error_file = self._container.path("/tmp/mysqlsh_error.json")
87+
temporary_script_file.write_text(script)
5788
try:
58-
output = self._container.run_mysql_shell(
89+
self._container.run_mysql_shell(
5990
[
6091
"--no-wizard",
6192
"--python",
@@ -64,21 +95,34 @@ def _run_commands(self, commands: typing.List[str]) -> str:
6495
]
6596
)
6697
except container.CalledProcessError as e:
67-
logger.exception(f"Failed to run {logged_commands=}\nstderr:\n{e.stderr}\n")
98+
logger.exception(
99+
f"Failed to run MySQL Shell script:\n{logged_script}\n\nstderr:\n{e.stderr}\n"
100+
)
68101
raise
69102
finally:
70103
temporary_script_file.unlink()
71-
return output
104+
with error_file.open("r") as file:
105+
exception = json.load(file)
106+
error_file.unlink()
107+
try:
108+
if exception:
109+
raise ShellDBError(**exception)
110+
except ShellDBError as e:
111+
if e.code == 2003:
112+
logger.exception(server_exceptions.ConnectionError.MESSAGE)
113+
raise server_exceptions.ConnectionError
114+
else:
115+
logger.exception(
116+
f"Failed to run MySQL Shell script:\n{logged_script}\n\nMySQL client error {e.code}\nMySQL Shell traceback:\n{e.traceback_message}\n"
117+
)
118+
raise
72119

73120
# TODO python3.10 min version: Use `list` instead of `typing.List`
74121
def _run_sql(self, sql_statements: typing.List[str]) -> None:
75122
"""Connect to MySQL cluster and execute SQL."""
76-
commands = []
77-
for statement in sql_statements:
78-
# Escape double quote (") characters in statement
79-
statement = statement.replace('"', r"\"")
80-
commands.append('session.run_sql("' + statement + '")')
81-
self._run_commands(commands)
123+
self._run_code(
124+
_jinja_env.get_template("run_sql.py.jinja").render(statements=sql_statements)
125+
)
82126

83127
@staticmethod
84128
def _generate_password() -> str:
@@ -135,14 +179,17 @@ def get_mysql_router_user_for_unit(
135179
again.
136180
"""
137181
logger.debug(f"Getting MySQL Router user for {unit_name=}")
138-
rows = json.loads(
139-
self._run_commands(
140-
[
141-
f"result = session.run_sql(\"SELECT USER, ATTRIBUTE->>'$.router_id' FROM INFORMATION_SCHEMA.USER_ATTRIBUTES WHERE ATTRIBUTE->'$.created_by_user'='{self.username}' AND ATTRIBUTE->'$.created_by_juju_unit'='{unit_name}'\")",
142-
"print(result.fetch_all())",
143-
]
182+
output_file = self._container.path("/tmp/mysqlsh_output.json")
183+
self._run_code(
184+
_jinja_env.get_template("get_mysql_router_user_for_unit.py.jinja").render(
185+
username=self.username,
186+
unit_name=unit_name,
187+
output_filepath=output_file.relative_to_container,
144188
)
145189
)
190+
with output_file.open("r") as file:
191+
rows = json.load(file)
192+
output_file.unlink()
146193
if not rows:
147194
logger.debug(f"No MySQL Router user found for {unit_name=}")
148195
return
@@ -159,8 +206,10 @@ def remove_router_from_cluster_metadata(self, router_id: str) -> None:
159206
metadata already exists for the router ID.
160207
"""
161208
logger.debug(f"Removing {router_id=} from cluster metadata")
162-
self._run_commands(
163-
["cluster = dba.get_cluster()", f'cluster.remove_router_metadata("{router_id}")']
209+
self._run_code(
210+
_jinja_env.get_template("remove_router_from_cluster_metadata.py.jinja").render(
211+
router_id=router_id
212+
)
164213
)
165214
logger.debug(f"Removed {router_id=} from cluster metadata")
166215

@@ -177,12 +226,24 @@ def delete_user(self, username: str, *, must_exist=True) -> None:
177226
def is_router_in_cluster_set(self, router_id: str) -> bool:
178227
"""Check if MySQL Router is part of InnoDB ClusterSet."""
179228
logger.debug(f"Checking if {router_id=} in cluster set")
180-
output = json.loads(
181-
self._run_commands(
182-
["cluster_set = dba.get_cluster_set()", "print(cluster_set.list_routers())"]
229+
output_file = self._container.path("/tmp/mysqlsh_output.json")
230+
self._run_code(
231+
_jinja_env.get_template("get_routers_in_cluster_set.py.jinja").render(
232+
output_filepath=output_file.relative_to_container
183233
)
184234
)
235+
with output_file.open("r") as file:
236+
output = json.load(file)
237+
output_file.unlink()
185238
cluster_set_router_ids = output["routers"].keys()
186239
logger.debug(f"{cluster_set_router_ids=}")
187240
logger.debug(f"Checked if {router_id in cluster_set_router_ids=}")
188241
return router_id in cluster_set_router_ids
242+
243+
244+
_jinja_env = jinja2.Environment(
245+
autoescape=False,
246+
trim_blocks=True,
247+
loader=jinja2.FileSystemLoader(pathlib.Path(__file__).parent / "templates"),
248+
undefined=jinja2.StrictUndefined,
249+
)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import json
2+
3+
result = session.run_sql(
4+
"SELECT USER, ATTRIBUTE->>'$.router_id' FROM INFORMATION_SCHEMA.USER_ATTRIBUTES WHERE ATTRIBUTE->'$.created_by_user'='{{ username }}' AND ATTRIBUTE->'$.created_by_juju_unit'='{{ unit_name }}'"
5+
)
6+
rows = result.fetch_all()
7+
# mysqlsh objects are weird—they quack (i.e. duck typing) like standard Python objects (e.g. list,
8+
# dict), but do not serialize to JSON correctly.
9+
# Cast to str & load from JSON str before serializing
10+
rows = json.loads(str(rows))
11+
with open("{{ output_filepath }}", "w") as file:
12+
json.dump(rows, file)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import json
2+
3+
cluster_set = dba.get_cluster_set()
4+
routers = cluster_set.list_routers()
5+
# mysqlsh objects are weird—they quack (i.e. duck typing) like standard Python objects (e.g. list,
6+
# dict), but do not serialize to JSON correctly.
7+
# Cast to str & load from JSON str before serializing
8+
routers = json.loads(str(routers))
9+
with open("{{ output_filepath }}", "w") as file:
10+
json.dump(routers, file)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
cluster = dba.get_cluster()
2+
cluster.remove_router_metadata("{{ router_id }}")
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{% for statement in statements %}
2+
session.run_sql("{{ statement|replace('"', '\\"') }}")
3+
{% endfor %}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import json
2+
import mysqlsh
3+
import traceback
4+
5+
try:
6+
shell.connect("{{ username }}:{{ password }}@{{ host }}:{{ port }}")
7+
{{ code|indent(width=4) }}
8+
except mysqlsh.DBError as exception:
9+
error = {
10+
"message": str(exception),
11+
"code": exception.code,
12+
"traceback_message": "".join(traceback.format_exception(exception)),
13+
}
14+
else:
15+
error = None
16+
with open("{{ error_filepath }}", "w") as file:
17+
json.dump(error, file)

0 commit comments

Comments
 (0)