Skip to content

Commit d97adc2

Browse files
[DPE-4118] Address drained units rejoining the cluster with a new PV (#433)
* Address drained units rejoining the cluster with a new PV * WIP: Address pod eviction and pvc deletion during node drain * Address PR feedback + add integration test for node drain * Fix failing unit test * Add missing kwarg used for get_cluster_status * Fix various bugs related to retrieving cluster name * Update data_interfaces charm lib + pull in PR version of mysql charm lib * Pull in latest version of the mysql charm lib * Adjustments based on the implementation of MySQL.cluster_metadata_exists * Remove instance explicitly and rescan to clean stale user accounts * Update the mysql lib to test latest changes in the vm charm * Force remove instances during node drain * Update all outdated charm libs
1 parent ee7b04c commit d97adc2

File tree

12 files changed

+320
-69
lines changed

12 files changed

+320
-69
lines changed

lib/charms/data_platform_libs/v0/data_interfaces.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent):
331331

332332
# Increment this PATCH version before using `charmcraft publish-lib` or reset
333333
# to 0 if you are raising the major API version
334-
LIBPATCH = 37
334+
LIBPATCH = 38
335335

336336
PYDEPS = ["ops>=2.0.0"]
337337

@@ -2606,6 +2606,14 @@ def set_version(self, relation_id: int, version: str) -> None:
26062606
"""
26072607
self.update_relation_data(relation_id, {"version": version})
26082608

2609+
def set_subordinated(self, relation_id: int) -> None:
2610+
"""Raises the subordinated flag in the application relation databag.
2611+
2612+
Args:
2613+
relation_id: the identifier for a particular relation.
2614+
"""
2615+
self.update_relation_data(relation_id, {"subordinated": "true"})
2616+
26092617

26102618
class DatabaseProviderEventHandlers(EventHandlers):
26112619
"""Provider-side of the database relation handlers."""
@@ -2842,6 +2850,21 @@ def _on_relation_created_event(self, event: RelationCreatedEvent) -> None:
28422850

28432851
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
28442852
"""Event emitted when the database relation has changed."""
2853+
is_subordinate = False
2854+
remote_unit_data = None
2855+
for key in event.relation.data.keys():
2856+
if isinstance(key, Unit) and not key.name.startswith(self.charm.app.name):
2857+
remote_unit_data = event.relation.data[key]
2858+
elif isinstance(key, Application) and key.name != self.charm.app.name:
2859+
is_subordinate = event.relation.data[key].get("subordinated") == "true"
2860+
2861+
if is_subordinate:
2862+
if not remote_unit_data:
2863+
return
2864+
2865+
if remote_unit_data.get("state") != "ready":
2866+
return
2867+
28452868
# Check which data has changed to emit customs events.
28462869
diff = self._diff(event)
28472870

lib/charms/mysql/v0/async_replication.py

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
# The unique Charmhub library identifier, never change it
5555
LIBID = "4de21f1a022c4e2c87ac8e672ec16f6a"
5656
LIBAPI = 0
57-
LIBPATCH = 4
57+
LIBPATCH = 5
5858

5959
RELATION_OFFER = "replication-offer"
6060
RELATION_CONSUMER = "replication"
@@ -248,8 +248,6 @@ def on_async_relation_broken(self, event: RelationBrokenEvent): # noqa: C901
248248
"\tThe cluster can be recreated with the `recreate-cluster` action.\n"
249249
"\tAlternatively the cluster can be rejoined to the cluster set."
250250
)
251-
# reset the cluster node count flag
252-
del self._charm.app_peer_data["units-added-to-cluster"]
253251
# set flag to persist removed from cluster-set state
254252
self._charm.app_peer_data["removed-from-cluster-set"] = "true"
255253

@@ -834,8 +832,6 @@ def _on_consumer_changed(self, event): # noqa: C901
834832
self._charm.unit.status = MaintenanceStatus("Dissolving replica cluster")
835833
logger.info("Dissolving replica cluster")
836834
self._charm._mysql.dissolve_cluster()
837-
# reset the cluster node count flag
838-
del self._charm.app_peer_data["units-added-to-cluster"]
839835
# reset force rejoin-secondaries flag
840836
del self._charm.app_peer_data["rejoin-secondaries"]
841837

@@ -869,11 +865,6 @@ def _on_consumer_changed(self, event): # noqa: C901
869865
if cluster_set_domain_name := self._charm._mysql.get_cluster_set_name():
870866
self._charm.app_peer_data["cluster-set-domain-name"] = cluster_set_domain_name
871867

872-
# set the number of units added to the cluster for a single unit replica cluster
873-
# needed here since it will skip the `RECOVERING` state
874-
if self._charm.app.planned_units() == 1:
875-
self._charm.app_peer_data["units-added-to-cluster"] = "1"
876-
877868
self._charm._on_update_status(None)
878869
elif state == States.RECOVERING:
879870
# recovering cluster (copying data and/or joining units)
@@ -882,10 +873,6 @@ def _on_consumer_changed(self, event): # noqa: C901
882873
"Waiting for recovery to complete on other units"
883874
)
884875
logger.debug("Awaiting other units to join the cluster")
885-
# reset the number of units added to the cluster
886-
# this will trigger secondaries to join the cluster
887-
node_count = self._charm._mysql.get_cluster_node_count()
888-
self._charm.app_peer_data["units-added-to-cluster"] = str(node_count)
889876
# set state flags to allow secondaries to join the cluster
890877
self._charm.unit_peer_data["member-state"] = "online"
891878
self._charm.unit_peer_data["member-role"] = "primary"

lib/charms/mysql/v0/mysql.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def wait_until_mysql_connection(self) -> None:
128128
# Increment this major API version when introducing breaking changes
129129
LIBAPI = 0
130130

131-
LIBPATCH = 62
131+
LIBPATCH = 64
132132

133133
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
134134
UNIT_ADD_LOCKNAME = "unit-add"
@@ -589,7 +589,6 @@ def create_cluster(self) -> None:
589589
# rescan cluster for cleanup of unused
590590
# recovery users
591591
self._mysql.rescan_cluster()
592-
self.app_peer_data["units-added-to-cluster"] = "1"
593592

594593
state, role = self._mysql.get_member_state()
595594

@@ -1779,6 +1778,27 @@ def _get_host_ip(host: str) -> str:
17791778

17801779
return ",".join(rw_endpoints), ",".join(ro_endpoints), ",".join(no_endpoints)
17811780

1781+
def execute_remove_instance(
1782+
self, connect_instance: Optional[str] = None, force: bool = False
1783+
) -> None:
1784+
"""Execute the remove_instance() script with mysqlsh.
1785+
1786+
Args:
1787+
connect_instance: (optional) The instance from where to run the remove_instance()
1788+
force: (optional) Whether to force the removal of the instance
1789+
"""
1790+
remove_instance_options = {
1791+
"password": self.cluster_admin_password,
1792+
"force": "true" if force else "false",
1793+
}
1794+
remove_instance_commands = (
1795+
f"shell.connect('{self.cluster_admin_user}:{self.cluster_admin_password}@{connect_instance or self.instance_address}')",
1796+
f"cluster = dba.get_cluster('{self.cluster_name}')",
1797+
"cluster.remove_instance("
1798+
f"'{self.cluster_admin_user}@{self.instance_address}', {remove_instance_options})",
1799+
)
1800+
self._run_mysqlsh_script("\n".join(remove_instance_commands))
1801+
17821802
@retry(
17831803
retry=retry_if_exception_type(MySQLRemoveInstanceRetryError),
17841804
stop=stop_after_attempt(15),
@@ -1842,17 +1862,7 @@ def remove_instance(self, unit_label: str, lock_instance: Optional[str] = None)
18421862
)
18431863

18441864
# Just remove instance
1845-
remove_instance_options = {
1846-
"password": self.cluster_admin_password,
1847-
"force": "true",
1848-
}
1849-
remove_instance_commands = (
1850-
f"shell.connect('{self.cluster_admin_user}:{self.cluster_admin_password}@{self.instance_address}')",
1851-
f"cluster = dba.get_cluster('{self.cluster_name}')",
1852-
"cluster.remove_instance("
1853-
f"'{self.cluster_admin_user}@{self.instance_address}', {remove_instance_options})",
1854-
)
1855-
self._run_mysqlsh_script("\n".join(remove_instance_commands))
1865+
self.execute_remove_instance(force=True)
18561866
except MySQLClientError as e:
18571867
# In case of an error, raise an error and retry
18581868
logger.warning(

lib/charms/mysql/v0/tls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252

5353
LIBID = "eb73947deedd4380a3a90d527e0878eb"
5454
LIBAPI = 0
55-
LIBPATCH = 5
55+
LIBPATCH = 6
5656

5757
SCOPE = "unit"
5858

poetry.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/charm.py

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,17 @@
4141
from charms.rolling_ops.v0.rollingops import RollingOpsManager
4242
from charms.tempo_k8s.v1.charm_tracing import trace_charm
4343
from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer
44-
from ops import EventBase, RelationBrokenEvent, RelationCreatedEvent, Unit
44+
from ops import EventBase, RelationBrokenEvent, RelationCreatedEvent
4545
from ops.charm import RelationChangedEvent, UpdateStatusEvent
4646
from ops.main import main
47-
from ops.model import ActiveStatus, BlockedStatus, Container, MaintenanceStatus, WaitingStatus
47+
from ops.model import (
48+
ActiveStatus,
49+
BlockedStatus,
50+
Container,
51+
MaintenanceStatus,
52+
Unit,
53+
WaitingStatus,
54+
)
4855
from ops.pebble import Layer
4956

5057
from config import CharmConfig, MySQLConfig
@@ -189,7 +196,7 @@ def tracing_endpoint(self) -> Optional[str]:
189196
def _mysql(self) -> MySQL:
190197
"""Returns an instance of the MySQL object from mysql_k8s_helpers."""
191198
return MySQL(
192-
self._get_unit_fqdn(),
199+
self.get_unit_address(),
193200
self.app_peer_data["cluster-name"],
194201
self.app_peer_data["cluster-set-domain-name"],
195202
self.get_secret("app", ROOT_PASSWORD_KEY), # pyright: ignore [reportArgumentType]
@@ -252,11 +259,7 @@ def restart_peers(self) -> Optional[ops.model.Relation]:
252259
@property
253260
def unit_address(self) -> str:
254261
"""Return the address of this unit."""
255-
return self._get_unit_fqdn()
256-
257-
def get_unit_address(self, unit: Unit) -> str:
258-
"""Return the address of a unit."""
259-
return self._get_unit_fqdn(unit.name)
262+
return self.get_unit_address()
260263

261264
def get_unit_hostname(self, unit_name: Optional[str] = None) -> str:
262265
"""Get the hostname.localdomain for a unit.
@@ -272,17 +275,15 @@ def get_unit_hostname(self, unit_name: Optional[str] = None) -> str:
272275
unit_name = unit_name or self.unit.name
273276
return f"{unit_name.replace('/', '-')}.{self.app.name}-endpoints"
274277

275-
def _get_unit_fqdn(self, unit_name: Optional[str] = None) -> str:
276-
"""Create a fqdn for a unit.
278+
def get_unit_address(self, unit: Optional[Unit] = None) -> str:
279+
"""Get fqdn/address for a unit.
277280
278281
Translate juju unit name to resolvable hostname.
279-
280-
Args:
281-
unit_name: unit name
282-
Returns:
283-
A string representing the fqdn of the unit.
284282
"""
285-
return getfqdn(self.get_unit_hostname(unit_name))
283+
if not unit:
284+
unit = self.unit
285+
286+
return getfqdn(self.get_unit_hostname(unit.name))
286287

287288
def is_unit_busy(self) -> bool:
288289
"""Returns whether the unit is busy."""
@@ -294,7 +295,7 @@ def _get_primary_from_online_peer(self) -> Optional[str]:
294295
if self.peers.data[unit].get("member-state") == "online":
295296
try:
296297
return self._mysql.get_cluster_primary_address(
297-
connect_instance_address=self._get_unit_fqdn(unit.name),
298+
connect_instance_address=self.get_unit_address(unit),
298299
)
299300
except MySQLGetClusterPrimaryAddressError:
300301
# try next unit
@@ -325,7 +326,7 @@ def join_unit_to_cluster(self) -> None:
325326
Try to join the unit from the primary unit.
326327
"""
327328
instance_label = self.unit.name.replace("/", "-")
328-
instance_address = self._get_unit_fqdn(self.unit.name)
329+
instance_address = self.get_unit_address(self.unit)
329330

330331
if not self._mysql.is_instance_in_cluster(instance_label):
331332
# Add new instance to the cluster
@@ -370,6 +371,21 @@ def join_unit_to_cluster(self) -> None:
370371
# Stop GR for cases where the instance was previously part of the cluster
371372
# harmless otherwise
372373
self._mysql.stop_group_replication()
374+
375+
# If instance already in cluster, before adding instance to cluster,
376+
# remove the instance from the cluster and call rescan_cluster()
377+
# without adding/removing instances to clean up stale users
378+
if (
379+
instance_label
380+
in self._mysql.get_cluster_status(from_instance=cluster_primary)[
381+
"defaultreplicaset"
382+
]["topology"].keys()
383+
):
384+
self._mysql.execute_remove_instance(
385+
connect_instance=cluster_primary, force=True
386+
)
387+
self._mysql.rescan_cluster(from_instance=cluster_primary)
388+
373389
self._mysql.add_instance_to_cluster(
374390
instance_address=instance_address,
375391
instance_unit_label=instance_label,
@@ -385,7 +401,6 @@ def join_unit_to_cluster(self) -> None:
385401
logger.debug("waiting: failed to acquire lock when adding instance to cluster")
386402
return
387403

388-
# Update 'units-added-to-cluster' counter in the peer relation databag
389404
self.unit_peer_data["member-state"] = "online"
390405
self.unit.status = ActiveStatus(self.active_status_message)
391406
logger.debug(f"Instance {instance_label} is cluster member")
@@ -669,7 +684,7 @@ def _on_mysql_pebble_ready(self, event) -> None:
669684
# First run setup
670685
self._configure_instance(container)
671686

672-
if not self.unit.is_leader():
687+
if not self.unit.is_leader() or self.cluster_initialized:
673688
# Non-leader units try to join cluster
674689
self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
675690
self.unit_peer_data.update({"member-role": "secondary", "member-state": "waiting"})
@@ -793,10 +808,6 @@ def _on_update_status(self, _: Optional[UpdateStatusEvent]) -> None:
793808

794809
def _set_app_status(self) -> None:
795810
"""Set the application status based on the cluster state."""
796-
nodes = self._mysql.get_cluster_node_count()
797-
if nodes > 0:
798-
self.app_peer_data["units-added-to-cluster"] = str(nodes)
799-
800811
try:
801812
primary_address = self._mysql.get_cluster_primary_address()
802813
except MySQLGetClusterPrimaryAddressError:
@@ -838,7 +849,7 @@ def _on_database_storage_detaching(self, _) -> None:
838849
logger.info("Switching primary to unit 0")
839850
try:
840851
self._mysql.set_cluster_primary(
841-
new_primary_address=self._get_unit_fqdn(f"{self.app.name}/0")
852+
new_primary_address=getfqdn(self.get_unit_hostname(f"{self.app.name}/0"))
842853
)
843854
except MySQLSetClusterPrimaryError:
844855
logger.warning("Failed to switch primary to unit 0")

src/upgrade.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import json
77
import logging
8+
from socket import getfqdn
89
from typing import TYPE_CHECKING
910

1011
from charms.data_platform_libs.v0.upgrade import (
@@ -174,12 +175,12 @@ def _pre_upgrade_prepare(self) -> None:
174175
"""
175176
if self.charm._mysql.get_primary_label() != f"{self.charm.app.name}-0":
176177
# set the primary to the first unit for switchover mitigation
177-
new_primary = self.charm._get_unit_fqdn(f"{self.charm.app.name}/0")
178+
new_primary = getfqdn(self.charm.get_unit_hostname(f"{self.charm.app.name}/0"))
178179
self.charm._mysql.set_cluster_primary(new_primary)
179180

180181
# set slow shutdown on all instances
181182
for unit in self.app_units:
182-
unit_address = self.charm._get_unit_fqdn(unit.name)
183+
unit_address = self.charm.get_unit_address(unit)
183184
self.charm._mysql.set_dynamic_variable(
184185
variable="innodb_fast_shutdown", value="0", instance_address=unit_address
185186
)
@@ -293,9 +294,7 @@ def _complete_upgrade(self):
293294
if self.charm.unit_label == f"{self.charm.app.name}/1":
294295
# penultimate unit, reset the primary for faster switchover
295296
try:
296-
self.charm._mysql.set_cluster_primary(
297-
self.charm._get_unit_fqdn(self.charm.unit.name)
298-
)
297+
self.charm._mysql.set_cluster_primary(self.charm.get_unit_address(self.charm.unit))
299298
except MySQLSetClusterPrimaryError:
300299
logger.debug("Failed to set primary")
301300

@@ -322,7 +321,7 @@ def _check_server_upgradeability(self) -> None:
322321
if len(self.upgrade_stack or []) < self.charm.app.planned_units():
323322
# check is done for 1st upgrading unit
324323
return
325-
instance = self.charm._get_unit_fqdn(f"{self.charm.app.name}/0")
324+
instance = getfqdn(self.charm.get_unit_hostname(f"{self.charm.app.name}/0"))
326325
self.charm._mysql.verify_server_upgradable(instance=instance)
327326
logger.debug("MySQL server is upgradeable")
328327

0 commit comments

Comments
 (0)