@@ -133,7 +133,7 @@ def wait_until_mysql_connection(self) -> None:
133
133
# Increment this major API version when introducing breaking changes
134
134
LIBAPI = 0
135
135
136
- LIBPATCH = 89
136
+ LIBPATCH = 90
137
137
138
138
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
139
139
UNIT_ADD_LOCKNAME = "unit-add"
@@ -230,6 +230,10 @@ class MySQLAddInstanceToClusterError(Error):
230
230
"""Exception raised when there is an issue add an instance to the MySQL InnoDB cluster."""
231
231
232
232
233
class MySQLRejoinInstanceToClusterError(Error):
    """Exception raised when rejoining an instance to the MySQL InnoDB cluster fails."""
235
+
236
+
233
237
class MySQLRemoveInstanceRetryError (Error ):
234
238
"""Exception raised when there is an issue removing an instance.
235
239
@@ -291,6 +295,10 @@ class MySQLRebootFromCompleteOutageError(Error):
291
295
"""Exception raised when there is an issue rebooting from complete outage."""
292
296
293
297
298
class MySQLForceQuorumFromInstanceError(Error):
    """Exception raised when forcing quorum from an instance fails."""
300
+
301
+
294
302
class MySQLSetInstanceOfflineModeError (Error ):
295
303
"""Exception raised when there is an issue setting instance as offline."""
296
304
@@ -476,7 +484,11 @@ def __init__(self, *args):
476
484
self .framework .observe (self .on .get_cluster_status_action , self ._get_cluster_status )
477
485
self .framework .observe (self .on .get_password_action , self ._on_get_password )
478
486
self .framework .observe (self .on .set_password_action , self ._on_set_password )
487
+ self .framework .observe (self .on .promote_to_primary_action , self ._on_promote_to_primary )
479
488
self .framework .observe (self .on .recreate_cluster_action , self ._recreate_cluster )
489
+ self .framework .observe (
490
+ self .on [PEER ].relation_changed , self .check_topology_timestamp_change
491
+ )
480
492
481
493
# Set in some event handlers in order to avoid passing event down a chain
482
494
# of methods
@@ -582,6 +594,43 @@ def _get_cluster_status(self, event: ActionEvent) -> None:
582
594
"message" : "Failed to read cluster status. See logs for more information." ,
583
595
})
584
596
597
def _on_promote_to_primary(self, event: ActionEvent) -> None:
    """Action handler that makes this unit the cluster primary.

    Only the ``unit`` scope is handled here; any other scope returns without
    touching the event. When the ``force`` parameter is set, quorum is forced
    from this instance (failover); otherwise a regular switchover to this
    unit is requested.

    Args:
        event: The action event carrying the ``scope`` and ``force`` params.
    """
    if event.params.get("scope") != "unit":
        return

    if self._mysql.is_unit_primary(self.unit_label):
        event.set_results({
            "success": False,
            "message": "Unit is already primary",
        })
        return

    if event.params.get("force"):
        # Failover
        logger.info("Forcing quorum from instance")
        try:
            self._mysql.force_quorum_from_instance()
        except MySQLForceQuorumFromInstanceError:
            logger.exception("Failed to force quorum from instance")
            event.fail("Failed to force quorum from instance. See logs for more information.")
            # fail() only records the failure; stop here so a failed action
            # neither bumps the topology timestamp nor reports success.
            return
    else:
        # Switchover
        logger.info("Setting unit as cluster primary")
        try:
            self._mysql.set_cluster_primary(self.get_unit_hostname())
        except MySQLSetClusterPrimaryError:
            logger.exception("Failed to set cluster primary")
            event.fail("Failed to change cluster primary. See logs for more information.")
            # Same as above: do not fall through to the success path.
            return

    # Use peer relation to trigger endpoint update
    # refer to mysql_provider.py
    self.unit_peer_data.update({"topology-change-timestamp": str(int(time.time()))})
    event.set_results({
        "success": True,
        "message": "Unit promoted to primary",
    })
633
+
585
634
def _recreate_cluster (self , event : ActionEvent ) -> None :
586
635
"""Action used to recreate the cluster, for special cases."""
587
636
if not self .unit .is_leader ():
@@ -623,6 +672,37 @@ def create_cluster(self) -> None:
623
672
624
673
self .unit_peer_data .update ({"member-state" : state , "member-role" : role })
625
674
675
@abstractmethod
def update_endpoints(self) -> None:
    """Refresh the endpoints published for the cluster.

    Abstract hook: the base implementation only raises, so concrete
    subclasses have to provide the real behaviour.
    """
    raise NotImplementedError
679
+
680
def check_topology_timestamp_change(self, _) -> None:
    """React to topology timestamps published by peer units.

    Collects the timestamps the other peers have stored in their unit
    databags. The leader calls ``update_endpoints`` when a peer holds a newer
    timestamp than its own; in every other case this unit stores the newest
    known timestamp in its own databag, which in turn notifies the peers.

    Covers endpoint refreshes for events outside the usual add/remove unit
    or update status flow.
    """
    key = "topology-change-timestamp"
    relation = self.peers

    peer_stamps = set()
    for member in relation.units:
        # Skip peers that never published (or published an empty) timestamp.
        if relation.data[member].get(key):
            peer_stamps.add(int(relation.data[member][key]))

    if not peer_stamps:
        # no topology change detected
        return

    own_stamp = int(self.unit_peer_data.get(key, "0"))
    newest_peer_stamp = max(peer_stamps)

    if self.unit.is_leader() and newest_peer_stamp > own_stamp:
        # update endpoints required
        self.update_endpoints()
        return

    # sync timestamp and trigger relation changed
    newest_known = max(newest_peer_stamp, own_stamp)
    self.unit_peer_data.update({key: str(newest_known)})
705
+
626
706
@property
627
707
def peers (self ) -> Optional [ops .model .Relation ]:
628
708
"""Retrieve the peer relation."""
@@ -1941,6 +2021,27 @@ def add_instance_to_cluster(
1941
2021
# always release the lock
1942
2022
self ._release_lock (local_lock_instance , instance_unit_label , UNIT_ADD_LOCKNAME )
1943
2023
2024
def rejoin_instance_to_cluster(
    self, *, unit_label: str, from_instance: Optional[str] = None
) -> None:
    """Rejoin an instance to the InnoDB cluster.

    Args:
        unit_label: Identifier of the instance to rejoin; passed verbatim to
            ``cluster.rejoin_instance``.
        from_instance: (optional) Address of the instance the script is run
            against. Falls back to this instance's address when empty.

    Raises:
        MySQLRejoinInstanceToClusterError: If the mysqlsh script fails.
    """
    commands = (
        f"cluster = dba.get_cluster('{self.cluster_name}')",
        # NOTE(review): the AdminAPI describes this argument as an instance
        # definition (connection data) - confirm that a label is accepted.
        f"cluster.rejoin_instance('{unit_label}')",
    )

    from_instance = from_instance or self.instance_address

    try:
        logger.debug("Rejoining instance %s to cluster %s", unit_label, self.cluster_name)
        self._run_mysqlsh_script(
            "\n".join(commands),
            user=self.server_config_user,
            password=self.server_config_password,
            host=self.instance_def(self.server_config_user, from_instance),
        )
    except MySQLClientError as err:
        logger.error(
            "Failed to rejoin instance %s to cluster %s", unit_label, self.cluster_name
        )
        raise MySQLRejoinInstanceToClusterError from err
2044
+
1944
2045
def is_instance_configured_for_innodb (
1945
2046
self , instance_address : str , instance_unit_label : str
1946
2047
) -> bool :
@@ -2069,6 +2170,31 @@ def is_instance_in_cluster(self, unit_label: str) -> bool:
2069
2170
)
2070
2171
return False
2071
2172
2173
def instance_belongs_to_cluster(self, unit_label: str) -> bool:
    """Check if instance belongs to cluster independently of current state.

    Looks the instance up in the InnoDB cluster metadata tables rather than
    in the live cluster status.

    Args:
        unit_label: The label of the unit to check.

    Returns:
        True when exactly one metadata row matches the label within this
        cluster; False otherwise, including when the query itself fails.
    """
    # Kept as a one-element tuple, as in the original call.
    # NOTE(review): assumes _run_mysqlcli_script takes a sequence of
    # statements - confirm against its definition.
    query = (
        "SELECT instance_id FROM mysql_innodb_cluster_metadata.instances WHERE cluster_id ="
        "(SELECT cluster_id FROM mysql_innodb_cluster_metadata.clusters WHERE cluster_name ="
        f" '{self.cluster_name}') AND instance_name = '{unit_label}';",
    )

    try:
        output = self._run_mysqlcli_script(
            query,
            user=self.server_config_user,
            password=self.server_config_password,
        )
    except MySQLClientError:
        logger.debug(
            "Instance has no cluster metadata, assuming it does not belong to any cluster."
        )
        return False
    return len(output) == 1
2197
+
2072
2198
@retry (
2073
2199
wait = wait_fixed (2 ),
2074
2200
stop = stop_after_attempt (3 ),
@@ -2218,14 +2344,23 @@ def execute_remove_instance(
2218
2344
wait = wait_random (min = 4 , max = 30 ),
2219
2345
)
2220
2346
def remove_instance ( # noqa: C901
2221
- self , unit_label : str , lock_instance : Optional [str ] = None
2347
+ self ,
2348
+ unit_label : str ,
2349
+ lock_instance : Optional [str ] = None ,
2350
+ auto_dissolve : Optional [bool ] = True ,
2222
2351
) -> None :
2223
2352
"""Remove instance from the cluster.
2224
2353
2225
2354
This method is called from each unit being torn down, thus we must obtain
2226
2355
locks on the cluster primary. There is a retry mechanism for any issues
2227
2356
obtaining the lock, removing instances/dissolving the cluster, or releasing
2228
2357
the lock.
2358
+
2359
+ Args:
2360
+ unit_label: The label of the unit to remove.
2361
+ lock_instance: (optional) The instance address to acquire the lock on.
2362
+ auto_dissolve: (optional) Whether to automatically dissolve the cluster
2363
+ if this is the last instance in the cluster.
2229
2364
"""
2230
2365
remaining_cluster_member_addresses = []
2231
2366
skip_release_lock = False
@@ -2263,7 +2398,8 @@ def remove_instance( # noqa: C901
2263
2398
self .remove_replica_cluster (self .cluster_name )
2264
2399
else :
2265
2400
skip_release_lock = True
2266
- self .dissolve_cluster ()
2401
+ if auto_dissolve :
2402
+ self .dissolve_cluster ()
2267
2403
2268
2404
else :
2269
2405
# Get remaining cluster member addresses before calling mysqlsh.remove_instance()
@@ -2314,7 +2450,7 @@ def remove_instance( # noqa: C901
2314
2450
2315
2451
def dissolve_cluster (self ) -> None :
2316
2452
"""Dissolve the cluster independently of the unit teardown process."""
2317
- logger .debug (f"Dissolving cluster { self .cluster_name } " )
2453
+ logger .info (f"Dissolving cluster { self .cluster_name } " )
2318
2454
dissolve_cluster_commands = (
2319
2455
f"cluster = dba.get_cluster('{ self .cluster_name } ')" ,
2320
2456
"cluster.dissolve({'force': 'true'})" ,
@@ -2745,6 +2881,27 @@ def start_group_replication(self) -> None:
2745
2881
except MySQLClientError :
2746
2882
logger .warning ("Failed to start Group Replication for unit" )
2747
2883
2884
def force_quorum_from_instance(self) -> None:
    """Force quorum from the current instance.

    Recovery for cases where majority loss put the cluster in defunct state.

    Raises:
        MySQLForceQuorumFromInstanceError: If the mysqlsh script fails.
    """
    # force_quorum_using_partition_of() needs the instance whose partition
    # becomes the new group; calling it without arguments cannot succeed.
    # The password is passed explicitly because the script runs
    # non-interactively and cannot answer a prompt.
    partition_instance = f"{self.server_config_user}@{self.instance_address}"
    force_quorum_command = (
        f"cluster = dba.get_cluster('{self.cluster_name}')",
        "cluster.force_quorum_using_partition_of("
        f"'{partition_instance}', '{self.server_config_password}')",
    )

    try:
        self._run_mysqlsh_script(
            "\n".join(force_quorum_command),
            user=self.server_config_user,
            password=self.server_config_password,
            host=self.instance_def(self.server_config_user),
        )
    except MySQLClientError as err:
        logger.error("Failed to force quorum from instance")
        raise MySQLForceQuorumFromInstanceError from err
2904
+
2748
2905
def reboot_from_complete_outage (self ) -> None :
2749
2906
"""Wrapper for reboot_cluster_from_complete_outage command."""
2750
2907
reboot_from_outage_command = (
0 commit comments