Skip to content

Commit 55c5e91

Browse files
Add in-place upgrades (#88)
Derived from canonical/mysql-router-k8s-operator#138
1 parent c10f6f4 commit 55c5e91

23 files changed

+674
-115
lines changed

actions.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright 2023 Canonical Ltd.
2+
# See LICENSE file for licensing details.
3+
4+
resume-upgrade:
5+
description: Upgrade remaining units (after you have manually verified that upgraded units are healthy).
6+
force-upgrade:
7+
description: Force upgrade of this unit.

charm_version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1

charmcraft.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,6 @@ parts:
2828
exit 1
2929
fi
3030
charm-entrypoint: src/machine_charm.py
31+
prime:
32+
- charm_version
33+
- workload_version

metadata.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ requires:
3636
interface: juju-info
3737
scope: container
3838
peers:
39+
upgrade-version-a:
40+
# Relation versioning scheme:
41+
# DA056 - Upgrading in-place upgrade protocol
42+
# https://docs.google.com/document/d/1H7qy5SAwLiCOKO9xMQJbbQP5_-jGV6Lhi-mJOk4gZ08/edit
43+
interface: upgrade
3944
# TODO TLS VM: re-enable peer relation
4045
# mysql-router-peers:
4146
# interface: mysql_router_peers

poetry.lock

Lines changed: 13 additions & 22 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ authors = []
1212
python = "^3.8.1" # ^3.8.1 required by flake8
1313
ops = "^2.6.0"
1414
tenacity = "^8.2.3"
15+
poetry-core = "^1.7.0"
1516
jinja2 = "^3.1.2"
1617

1718
[tool.poetry.group.charm-libs.dependencies]

src/abstract_charm.py

Lines changed: 109 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
import container
1515
import lifecycle
1616
import logrotate
17+
import machine_upgrade
1718
import relations.database_provides
1819
import relations.database_requires
20+
import upgrade
1921
import workload
2022

2123
logger = logging.getLogger(__name__)
@@ -35,11 +37,26 @@ def __init__(self, *args) -> None:
3537
self._authenticated_workload_type = workload.AuthenticatedWorkload
3638
self._database_requires = relations.database_requires.RelationEndpoint(self)
3739
self._database_provides = relations.database_provides.RelationEndpoint(self)
38-
self.framework.observe(self.on.update_status, self.reconcile_database_relations)
39-
# Set status on first start if no relations active
40-
self.framework.observe(self.on.start, self.reconcile_database_relations)
40+
self.framework.observe(self.on.update_status, self.reconcile)
41+
self.framework.observe(
42+
self.on[upgrade.PEER_RELATION_ENDPOINT_NAME].relation_changed, self.reconcile
43+
)
44+
self.framework.observe(
45+
self.on[upgrade.RESUME_ACTION_NAME].action, self._on_resume_upgrade_action
46+
)
47+
# (For Kubernetes) Reset partition after scale down
48+
self.framework.observe(
49+
self.on[upgrade.PEER_RELATION_ENDPOINT_NAME].relation_departed, self.reconcile
50+
)
51+
# Handle upgrade & set status on first start if no relations active
52+
self.framework.observe(self.on.start, self.reconcile)
4153
# Update app status
42-
self.framework.observe(self.on.leader_elected, self.reconcile_database_relations)
54+
self.framework.observe(self.on.leader_elected, self.reconcile)
55+
# Set versions in upgrade peer relation app databag
56+
self.framework.observe(
57+
self.on[upgrade.PEER_RELATION_ENDPOINT_NAME].relation_created,
58+
self._upgrade_relation_created,
59+
)
4360

4461
@property
4562
@abc.abstractmethod
@@ -60,6 +77,11 @@ def _tls_certificate_saved(self) -> bool:
6077
def _container(self) -> container.Container:
6178
"""Workload container (snap or ROCK)"""
6279

80+
@property
@abc.abstractmethod
def _upgrade(self) -> typing.Optional[upgrade.Upgrade]:
    """Upgrade handler for this charm, or `None`.

    NOTE(review): `reconcile` logs "Peer relation not available" when this value is falsy;
    presumably `None` is returned until the upgrade peer relation exists. Confirm against the
    concrete subclasses.
    """
84+
6385
@property
6486
@abc.abstractmethod
6587
def _logrotate(self) -> logrotate.LogRotate:
@@ -95,8 +117,8 @@ def _prioritize_statuses(statuses: typing.List[ops.StatusBase]) -> ops.StatusBas
95117
"""
96118
status_priority = (
97119
ops.BlockedStatus,
98-
ops.WaitingStatus,
99120
ops.MaintenanceStatus,
121+
ops.WaitingStatus,
100122
# Catch any unknown status type
101123
ops.StatusBase,
102124
)
@@ -108,6 +130,11 @@ def _prioritize_statuses(statuses: typing.List[ops.StatusBase]) -> ops.StatusBas
108130

109131
def _determine_app_status(self, *, event) -> ops.StatusBase:
110132
"""Report app status."""
133+
if self._upgrade and (upgrade_status := self._upgrade.app_status):
134+
# Upgrade status should take priority over relation status—even if the status level is
135+
# normally lower priority.
136+
# (Relations should not be modified during upgrade.)
137+
return upgrade_status
111138
statuses = []
112139
for endpoint in (self._database_requires, self._database_provides):
113140
if status := endpoint.get_status(event):
@@ -117,25 +144,28 @@ def _determine_app_status(self, *, event) -> ops.StatusBase:
117144
def _determine_unit_status(self, *, event) -> ops.StatusBase:
118145
"""Report unit status."""
119146
statuses = []
120-
workload_ = self.get_workload(event=event)
121-
statuses.append(workload_.get_status(event))
147+
workload_status = self.get_workload(event=event).status
148+
if self._upgrade:
149+
statuses.append(self._upgrade.get_unit_juju_status(workload_status=workload_status))
150+
statuses.append(workload_status)
122151
return self._prioritize_statuses(statuses)
123152

124-
def set_status(self, *, event) -> None:
153+
def set_status(self, *, event, app=True, unit=True) -> None:
125154
"""Set charm status."""
126-
if self._unit_lifecycle.authorized_leader:
155+
if app and self._unit_lifecycle.authorized_leader:
127156
self.app.status = self._determine_app_status(event=event)
128157
logger.debug(f"Set app status to {self.app.status}")
129-
self.unit.status = self._determine_unit_status(event=event)
130-
logger.debug(f"Set unit status to {self.unit.status}")
158+
if unit:
159+
self.unit.status = self._determine_unit_status(event=event)
160+
logger.debug(f"Set unit status to {self.unit.status}")
131161

132162
def wait_until_mysql_router_ready(self) -> None:
133163
"""Wait until a connection to MySQL Router is possible.
134164
135165
Retry every 5 seconds for up to 30 seconds.
136166
"""
137167
logger.debug("Waiting until MySQL Router is ready")
138-
self.unit.status = ops.WaitingStatus("MySQL Router starting")
168+
self.unit.status = ops.MaintenanceStatus("MySQL Router starting")
139169
try:
140170
for attempt in tenacity.Retrying(
141171
reraise=True,
@@ -156,21 +186,63 @@ def wait_until_mysql_router_ready(self) -> None:
156186
# Handlers
157187
# =======================
158188

159-
def reconcile_database_relations(self, event=None) -> None:
160-
"""Handle database requires/provides events."""
189+
def _upgrade_relation_created(self, _) -> None:
    """Set versions in the upgrade peer relation app databag.

    Does nothing unless this unit is the authorized leader.
    """
    if not self._unit_lifecycle.authorized_leader:
        return
    # During first charm installation/setup, `self._upgrade.is_compatible` should return
    # `True`
    self._upgrade.set_versions_in_app_databag()
194+
195+
def reconcile(self, event=None) -> None: # noqa: C901
196+
"""Handle most events."""
197+
if not self._upgrade:
198+
logger.debug("Peer relation not available")
199+
return
200+
if not self._upgrade.versions_set:
201+
logger.debug("Peer relation not ready")
202+
return
161203
workload_ = self.get_workload(event=event)
204+
if self._upgrade.unit_state == "restarting": # Kubernetes only
205+
if not self._upgrade.is_compatible:
206+
logger.info(
207+
"Upgrade incompatible. If you accept potential *data loss* and *downtime*, you can continue with `resume-upgrade force=true`"
208+
)
209+
self.unit.status = ops.BlockedStatus(
210+
"Upgrade incompatible. Rollback to previous revision with `juju refresh`"
211+
)
212+
self.set_status(event=event, unit=False)
213+
return
214+
elif isinstance(self._upgrade, machine_upgrade.Upgrade): # Machines only
215+
if not self._upgrade.is_compatible:
216+
self.set_status(event=event)
217+
return
218+
if self._upgrade.unit_state == "outdated":
219+
if self._upgrade.authorized:
220+
self._upgrade.upgrade_unit(
221+
workload_=workload_, tls=self._tls_certificate_saved
222+
)
223+
else:
224+
self.set_status(event=event)
225+
logger.debug("Waiting to upgrade")
226+
return
162227
logger.debug(
163228
"State of reconcile "
164229
f"{self._unit_lifecycle.authorized_leader=}, "
165230
f"{isinstance(workload_, workload.AuthenticatedWorkload)=}, "
166231
f"{workload_.container_ready=}, "
167-
f"{self._database_requires.is_relation_breaking(event)=}"
232+
f"{self._database_requires.is_relation_breaking(event)=}, "
233+
f"{self._upgrade.in_progress=}"
168234
)
169235
if self._unit_lifecycle.authorized_leader:
170236
if self._database_requires.is_relation_breaking(event):
237+
if self._upgrade.in_progress:
238+
logger.warning(
239+
"Modifying relations during an upgrade is not supported. The charm may be in a broken, unrecoverable state. Re-deploy the charm"
240+
)
171241
self._database_provides.delete_all_databags()
172242
elif (
173-
isinstance(workload_, workload.AuthenticatedWorkload) and workload_.container_ready
243+
not self._upgrade.in_progress
244+
and isinstance(workload_, workload.AuthenticatedWorkload)
245+
and workload_.container_ready
174246
):
175247
self._database_provides.reconcile_users(
176248
event=event,
@@ -182,4 +254,25 @@ def reconcile_database_relations(self, event=None) -> None:
182254
workload_.enable(tls=self._tls_certificate_saved, unit_name=self.unit.name)
183255
elif workload_.container_ready:
184256
workload_.disable()
257+
# Empty waiting status means we're waiting for database requires relation before starting
258+
# workload
259+
if not workload_.status or workload_.status == ops.WaitingStatus():
260+
self._upgrade.unit_state = "healthy"
261+
if self._unit_lifecycle.authorized_leader:
262+
self._upgrade.reconcile_partition()
263+
if not self._upgrade.in_progress:
264+
self._upgrade.set_versions_in_app_databag()
185265
self.set_status(event=event)
266+
267+
def _on_resume_upgrade_action(self, event: ops.ActionEvent) -> None:
    """Handle the resume upgrade action.

    The action is failed (with a debug log entry) when this unit is not the authorized leader,
    or when there is no upgrade handler or no upgrade in progress. Otherwise the action event
    is passed on to `self._upgrade.reconcile_partition`.
    """
    failure = None
    if not self._unit_lifecycle.authorized_leader:
        failure = f"Must run action on leader unit. (e.g. `juju run {self.app.name}/leader {upgrade.RESUME_ACTION_NAME}`)"
    elif not (self._upgrade and self._upgrade.in_progress):
        failure = "No upgrade in progress"

    if failure is not None:
        # Single exit path for both rejection reasons
        logger.debug(f"Resume upgrade event failed: {failure}")
        event.fail(failure)
        return

    self._upgrade.reconcile_partition(action_event=event)

0 commit comments

Comments
 (0)