Skip to content

Commit a4e98ce

Browse files
authored
DPE-4850 Fix intermitent issue on aks deployments (#458)
* fix for aks setup issue * avoid using juju for cluster initialization state * allow catch all on non-critical/retried handler * porting changes from vm * for storage re usage, cluster metadata exists * improved comments and messaging
1 parent 99f131a commit a4e98ce

File tree

12 files changed

+173
-529
lines changed

12 files changed

+173
-529
lines changed

lib/charms/mysql/v0/async_replication.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,6 @@ def on_async_relation_broken(self, event: RelationBrokenEvent): # noqa: C901
237237
self._charm.unit.status = BlockedStatus("Standalone read-only unit.")
238238
# reset flag to allow instances rejoining the cluster
239239
self._charm.unit_peer_data["member-state"] = "waiting"
240-
del self._charm.unit_peer_data["unit-initialized"]
241240
if not self._charm.unit.is_leader():
242241
# delay non leader to avoid `update_status` running before
243242
# leader updates app peer data
@@ -914,7 +913,6 @@ def _on_consumer_non_leader_changed(self, _):
914913
logger.debug("Reset secondary unit to allow cluster rejoin")
915914
# reset unit flag to allow cluster rejoin after primary recovery
916915
# the unit will rejoin on the next peer relation changed or update status
917-
del self._charm.unit_peer_data["unit-initialized"]
918916
self._charm.unit_peer_data["member-state"] = "waiting"
919917
self._charm.unit.status = WaitingStatus("waiting to join the cluster")
920918

lib/charms/mysql/v0/mysql.py

Lines changed: 111 additions & 484 deletions
Large diffs are not rendered by default.

lib/charms/mysql/v0/tls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def _on_set_tls_private_key(self, event: ActionEvent) -> None:
9292

9393
def _on_tls_relation_joined(self, event) -> None:
9494
"""Request certificate when TLS relation joined."""
95-
if self.charm.unit_peer_data.get("unit-initialized") != "True":
95+
if not self.charm.unit_initialized:
9696
event.defer()
9797
return
9898
self._request_certificate(None)

src/charm.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from charms.rolling_ops.v0.rollingops import RollingOpsManager
4242
from charms.tempo_k8s.v1.charm_tracing import trace_charm
4343
from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer
44-
from ops import EventBase, RelationBrokenEvent, RelationCreatedEvent
44+
from ops import EventBase, RelationBrokenEvent, RelationCreatedEvent, Unit
4545
from ops.charm import RelationChangedEvent, UpdateStatusEvent
4646
from ops.main import main
4747
from ops.model import ActiveStatus, BlockedStatus, Container, MaintenanceStatus, WaitingStatus
@@ -254,6 +254,10 @@ def unit_address(self) -> str:
254254
"""Return the address of this unit."""
255255
return self._get_unit_fqdn()
256256

257+
def get_unit_address(self, unit: Unit) -> str:
258+
"""Return the address of a unit."""
259+
return self._get_unit_fqdn(unit.name)
260+
257261
def get_unit_hostname(self, unit_name: Optional[str] = None) -> str:
258262
"""Get the hostname.localdomain for a unit.
259263
@@ -298,14 +302,21 @@ def _get_primary_from_online_peer(self) -> Optional[str]:
298302

299303
def _is_unit_waiting_to_join_cluster(self) -> bool:
300304
"""Return if the unit is waiting to join the cluster."""
301-
# alternatively, we could check if the instance is configured
302-
# and have an empty performance_schema.replication_group_members table
305+
# check base conditions for join a unit to the cluster
306+
# - workload accessible
307+
# - unit waiting flag set
308+
# - unit configured (users created/unit set to be a cluster node)
309+
# - unit not node of this cluster or cluster does not report this unit as member
310+
# - cluster is initialized on any unit
303311
return (
304312
self.unit.get_container(CONTAINER_NAME).can_connect()
305313
and self.unit_peer_data.get("member-state") == "waiting"
306-
and self._mysql.is_data_dir_initialised()
307-
and not self.unit_peer_data.get("unit-initialized")
308-
and int(self.app_peer_data.get("units-added-to-cluster", 0)) > 0
314+
and self.unit_configured
315+
and (
316+
not self.unit_initialized
317+
or not self._mysql.is_instance_in_cluster(self.unit_label)
318+
)
319+
and self.cluster_initialized
309320
)
310321

311322
def join_unit_to_cluster(self) -> None:
@@ -375,7 +386,6 @@ def join_unit_to_cluster(self) -> None:
375386
return
376387

377388
# Update 'units-added-to-cluster' counter in the peer relation databag
378-
self.unit_peer_data["unit-initialized"] = "True"
379389
self.unit_peer_data["member-state"] = "online"
380390
self.unit.status = ActiveStatus(self.active_status_message)
381391
logger.debug(f"Instance {instance_label} is cluster member")
@@ -386,7 +396,7 @@ def _reconcile_pebble_layer(self, container: Container) -> None:
386396
new_layer = self._pebble_layer
387397

388398
if new_layer.services != current_layer.services:
389-
logger.info("Adding pebble layer")
399+
logger.info("Reconciling the pebble layer")
390400

391401
container.add_layer(MYSQLD_SAFE_SERVICE, new_layer, combine=True)
392402
container.replan()
@@ -438,16 +448,18 @@ def _reconcile_mysqld_exporter(
438448
self, event: RelationCreatedEvent | RelationBrokenEvent
439449
) -> None:
440450
"""Handle a COS relation created or broken event."""
441-
if not self.unit_peer_data.get("unit-initialized"):
442-
# wait unit initialization to avoid messing
443-
# with the pebble layer before the unit is initialized
444-
logger.debug("Defer reconcile mysqld exporter")
445-
event.defer()
451+
container = self.unit.get_container(CONTAINER_NAME)
452+
if not container.can_connect():
453+
# reconciliation is done on pebble ready
454+
logger.debug("Skip reconcile mysqld exporter: container not ready")
446455
return
447456

448-
self.current_event = event
457+
if not container.pebble.get_plan():
458+
# reconciliation is done on pebble ready
459+
logger.debug("Skip reconcile mysqld exporter: empty pebble layer")
460+
return
449461

450-
container = self.unit.get_container(CONTAINER_NAME)
462+
self.current_event = event
451463
self._reconcile_pebble_layer(container)
452464

453465
def _on_peer_relation_joined(self, _) -> None:
@@ -645,9 +657,9 @@ def _on_mysql_pebble_ready(self, event) -> None:
645657
logger.info("Setting up the logrotate configurations")
646658
self._mysql.setup_logrotate_config()
647659

648-
self.unit_peer_data["unit-status"] = "alive"
649660
if self._mysql.is_data_dir_initialised():
650661
# Data directory is already initialised, skip configuration
662+
self.unit.status = MaintenanceStatus("Starting mysqld")
651663
logger.debug("Data directory is already initialised, skipping configuration")
652664
self._reconcile_pebble_layer(container)
653665
return
@@ -811,7 +823,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent) -> None:
811823
def _on_database_storage_detaching(self, _) -> None:
812824
"""Handle the database storage detaching event."""
813825
# Only executes if the unit was initialised
814-
if not self.unit_peer_data.get("unit-initialized"):
826+
if not self.unit_initialized:
815827
return
816828

817829
# No need to remove the instance from the cluster if it is not a member of the cluster

src/log_rotate_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def start_log_rotate_manager(self):
3434
if (
3535
not isinstance(self.charm.unit.status, ActiveStatus)
3636
or self.charm.peers is None
37-
or self.charm.unit_peer_data.get("unit-initialized") != "True"
37+
or not self.charm.unit_initialized
3838
):
3939
return
4040

src/relations/mysql.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def _update_status(self, _) -> None:
136136
if not (relation_data := self.charm.app_peer_data.get(MYSQL_RELATION_DATA_KEY)):
137137
return
138138

139-
if not self.charm.unit_peer_data.get("unit-initialized"):
139+
if not self.charm.unit_initialized:
140140
# Skip update status for uninitialized unit
141141
return
142142

@@ -168,9 +168,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent) -> None:
168168
if not self.model.get_relation(LEGACY_MYSQL):
169169
return
170170

171-
if not self.charm._is_peer_data_set or not self.charm.unit_peer_data.get(
172-
"unit-initialized"
173-
):
171+
if not (self.charm._is_peer_data_set and self.charm.unit_initialized):
174172
# Avoid running too early
175173
logger.info("Unit not ready to set `mysql` relation data. Deferring")
176174
event.defer()
@@ -213,7 +211,7 @@ def _on_mysql_relation_created(self, event: RelationCreatedEvent) -> None: # no
213211
# and for the member to be initialized and online
214212
if (
215213
not self.charm._is_peer_data_set
216-
or not self.charm.unit_peer_data.get("unit-initialized")
214+
or not self.charm.unit_initialized
217215
or self.charm.unit_peer_data.get("member-state") != "online"
218216
):
219217
logger.info("Unit not ready to execute `mysql` relation created. Deferring")

src/relations/mysql_provider.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,13 @@ def _on_mysql_pebble_ready(self, _: PebbleReadyEvent) -> None:
197197
container_restarts = int(self.charm.unit_peer_data.get(CONTAINER_RESTARTS, "0"))
198198
self.charm.unit_peer_data[CONTAINER_RESTARTS] = str(container_restarts + 1)
199199

200-
self._configure_endpoints(None)
200+
try:
201+
self._configure_endpoints(None)
202+
except Exception:
203+
# catch all exception to avoid uncommitted databag keys
204+
# from other pebble-ready handlers succeed
205+
# see: https://github.com/canonical/mysql-k8s-operator/issues/457
206+
logger.exception("Failed to update endpoints on database provider pebble ready event")
201207

202208
def _configure_endpoints(self, _) -> None:
203209
"""Update the endpoints + read_only_endpoints."""

src/relations/mysql_root.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,7 @@ def _on_mysql_root_relation_created(self, event: RelationCreatedEvent) -> None:
154154

155155
# Wait until on-config-changed event is executed
156156
# (wait for root password to have been set) or wait until the unit is initialized
157-
if not self.charm._is_peer_data_set or not self.charm.unit_peer_data.get(
158-
"unit-initialized"
159-
):
157+
if not (self.charm._is_peer_data_set and self.charm.unit_initialized):
160158
event.defer()
161159
return
162160

src/rotate_mysql_logs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ def _rotate_mysql_logs(self, _) -> None:
4545
"""Rotate the mysql logs."""
4646
if (
4747
self.charm.peers is None
48-
or self.charm.unit_peer_data.get("unit-initialized") != "True"
49-
or not self.charm.upgrade.idle
5048
or not self.charm._mysql.is_mysqld_running()
49+
or not self.charm.unit_initialized
50+
or not self.charm.upgrade.idle
5151
):
5252
# skip when not initialized, during an upgrade, or when mysqld is not running
5353
return

tests/unit/test_charm.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def test_on_leader_elected_secrets(self):
112112
secret_data[password].isalnum() and len(secret_data[password]) == PASSWORD_LENGTH
113113
)
114114

115+
@patch("mysql_k8s_helpers.MySQL.cluster_metadata_exists", return_value=False)
115116
@patch("mysql_k8s_helpers.MySQL.rescan_cluster")
116117
@patch("charms.mysql.v0.mysql.MySQLCharmBase.active_status_message", return_value="")
117118
@patch("upgrade.MySQLK8sUpgrade.idle", return_value=True)
@@ -155,6 +156,7 @@ def test_mysql_pebble_ready(
155156
_active_status_message,
156157
_upgrade_idle,
157158
_rescan_cluster,
159+
_cluster_metadata_exists,
158160
):
159161
# Check if initial plan is empty
160162
self.harness.set_can_connect("mysql", True)
@@ -200,14 +202,12 @@ def test_pebble_ready_set_data(
200202
# test on non leader
201203
self.harness.set_leader(is_leader=False)
202204
self.harness.container_pebble_ready("mysql")
203-
self.assertEqual(self.charm.unit_peer_data.get("unit-initialized"), None)
204205
self.assertEqual(self.charm.unit_peer_data["member-role"], "secondary")
205206
self.assertEqual(self.charm.unit_peer_data["member-state"], "waiting")
206207

207208
# test on leader
208209
self.harness.set_leader(is_leader=True)
209210
self.harness.container_pebble_ready("mysql")
210-
self.assertEqual(self.charm.unit_peer_data["unit-initialized"], "True")
211211
self.assertEqual(self.charm.unit_peer_data["member-state"], "online")
212212
self.assertEqual(self.charm.unit_peer_data["member-role"], "primary")
213213

@@ -308,6 +308,7 @@ def test_set_secret_databag(self, _):
308308
== "test-password"
309309
)
310310

311+
@patch("charm.MySQLOperatorCharm.unit_initialized", return_value=True)
311312
@patch("charms.mysql.v0.mysql.MySQLBase.is_cluster_replica", return_value=False)
312313
@patch("mysql_k8s_helpers.MySQL.remove_instance")
313314
@patch("mysql_k8s_helpers.MySQL.get_primary_label")
@@ -320,10 +321,8 @@ def test_database_storage_detaching(
320321
mock_get_primary_label,
321322
mock_remove_instance,
322323
mock_is_cluster_replica,
324+
mock_unit_initialized,
323325
):
324-
self.harness.update_relation_data(
325-
self.peer_relation_id, self.charm.unit.name, {"unit-initialized": "True"}
326-
)
327326
self.harness.update_relation_data(
328327
self.peer_relation_id,
329328
self.charm.app.name,

0 commit comments

Comments
 (0)