     MySQLConfigureMySQLUsersError,
     MySQLCreateClusterError,
     MySQLGetClusterPrimaryAddressError,
-    MySQLGetMemberStateError,
     MySQLGetMySQLVersionError,
     MySQLInitializeJujuOperationsTableError,
     MySQLLockAcquisitionError,
+    MySQLNoMemberStateError,
     MySQLRebootFromCompleteOutageError,
     MySQLServiceNotRunningError,
     MySQLSetClusterPrimaryError,
+    MySQLUnableToGetMemberStateError,
 )
 from charms.mysql.v0.tls import MySQLTLS
 from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
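
The hunk above swaps the single `MySQLGetMemberStateError` import for two more specific exceptions. A minimal sketch of how such a split might look in the charm library (class names come from the imports; the base class and docstrings are assumptions, not the library's actual code):

```python
# Sketch only: assumed shape of the exception split referenced by the imports.
class MySQLError(Exception):
    """Assumed base class for MySQL-related charm errors."""


class MySQLUnableToGetMemberStateError(MySQLError):
    """Raised when the member-state query itself fails (e.g. mysqld unreachable)."""


class MySQLNoMemberStateError(MySQLError):
    """Raised when the query succeeds but no member-state row exists yet."""
```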
@@ -700,7 +701,12 @@ def _on_mysql_pebble_ready(self, event) -> None:
         # First run setup
         self._configure_instance(container)
 
-        if not self.unit.is_leader() or self.cluster_initialized:
+        # We consider cluster initialized only if a primary already exists
+        # (as there can be metadata in the database but no primary if pod
+        # crashes while cluster is being created)
+        if not self.unit.is_leader() or (
+            self.cluster_initialized and self._get_primary_from_online_peer()
+        ):
             # Non-leader units try to join cluster
             self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
             self.unit_peer_data.update({"member-role": "secondary", "member-state": "waiting"})
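
The new guard only treats the cluster as initialized when an online peer can actually report a primary, which covers the case where metadata exists but cluster creation was interrupted. `_get_primary_from_online_peer()` is defined elsewhere in the charm; the following is a rough sketch of the assumed behaviour (the peer-iteration logic, the `_get_unit_address` helper, and the exact `get_cluster_primary_address` signature are assumptions):

```python
# Rough sketch, not the charm's actual implementation.
def _get_primary_from_online_peer(self) -> Optional[str]:
    """Return the cluster primary address reported by any online peer, or None."""
    for unit in self.peers.units:
        if self.peers.data[unit].get("member-state") != "online":
            continue
        try:
            # Assumed call shape; the real helper may take different arguments.
            return self._mysql.get_cluster_primary_address(
                connect_instance_address=self._get_unit_address(unit)
            )
        except MySQLGetClusterPrimaryAddressError:
            continue  # try the next online peer
    return None
```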
@@ -710,12 +716,14 @@ def _on_mysql_pebble_ready(self, event) -> None:
         try:
             # Create the cluster when is the leader unit
             logger.info(f"Creating cluster {self.app_peer_data['cluster-name']}")
+            self.unit.status = MaintenanceStatus("Creating cluster")
             self.create_cluster()
             self.unit.status = ops.ActiveStatus(self.active_status_message)
 
         except (
             MySQLCreateClusterError,
-            MySQLGetMemberStateError,
+            MySQLUnableToGetMemberStateError,
+            MySQLNoMemberStateError,
             MySQLInitializeJujuOperationsTableError,
             MySQLCreateClusterError,
         ):
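
Catching both new exceptions mirrors the two failure modes during cluster creation: the state query can fail outright, or it can succeed while the instance has no group-replication row yet. Below is an illustrative, assumed shape of `get_member_state()` that produces exactly those two errors; `_run_mysqlcli_script` and `MySQLClientError` are assumed helper names, not confirmed by this diff:

```python
# Illustrative only; the charm library's real implementation may differ.
def get_member_state(self) -> Tuple[str, str]:
    """Return (state, role) for this instance from performance_schema."""
    try:
        rows = self._run_mysqlcli_script(
            "SELECT member_state, member_role "
            "FROM performance_schema.replication_group_members "
            "WHERE member_id = @@server_uuid"
        )
    except MySQLClientError as e:
        # The query itself failed: mysqld down, bad credentials, ...
        raise MySQLUnableToGetMemberStateError from e
    if not rows:
        # Query ran, but this instance is not (yet) part of any group
        raise MySQLNoMemberStateError
    state, role = rows[0]
    return state.lower(), role.lower()
```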
@@ -728,19 +736,24 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         Returns:
             bool indicating whether the caller should return
         """
-        if not self.cluster_initialized or not self.unit_peer_data.get("member-role"):
-            # health checks are only after cluster and members are initialized
+        if not self._mysql.is_mysqld_running():
             return True
 
-        if not self._mysql.is_mysqld_running():
+        only_single_unitialized_node_across_cluster = (
+            self.only_one_cluster_node_thats_uninitialized
+        )
+
+        if (
+            not self.cluster_initialized and not only_single_unitialized_node_across_cluster
+        ) or not self.unit_peer_data.get("member-role"):
             return True
 
         # retrieve and persist state for every unit
         try:
             state, role = self._mysql.get_member_state()
             self.unit_peer_data["member-state"] = state
             self.unit_peer_data["member-role"] = role
-        except MySQLGetMemberStateError:
+        except (MySQLNoMemberStateError, MySQLUnableToGetMemberStateError):
             logger.error("Error getting member state. Avoiding potential cluster crash recovery")
             self.unit.status = MaintenanceStatus("Unable to get member state")
             return True
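
`only_one_cluster_node_thats_uninitialized` is a property defined outside this hunk; the handler uses it so that a single-node deployment whose only instance never joined a group is not filtered out before recovery. A simplified sketch of the assumed semantics follows (the real property may inspect cluster metadata rather than just the peer relation):

```python
# Simplified sketch with assumed semantics, not the charm's real property.
@property
def only_one_cluster_node_thats_uninitialized(self) -> bool:
    """True when this application has a single unit and it never became online."""
    if self.peers.units:
        # There are other units besides this one
        return False
    return not self._get_primary_from_online_peer()
```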
@@ -757,23 +770,33 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         if state == "recovering":
             return True
 
-        if state in ["offline"]:
+        if state == "offline":
             # Group Replication is active but the member does not belong to any group
             all_states = {
                 self.peers.data[unit].get("member-state", "unknown") for unit in self.peers.units
             }
-            # Add state for this unit (self.peers.units does not include this unit)
-            all_states.add("offline")
 
-            if all_states == {"offline"} and self.unit.is_leader():
+            # Add state 'offline' for this unit (self.peers.unit does not
+            # include this unit)
+            if (all_states | {"offline"} == {"offline"} and self.unit.is_leader()) or (
+                only_single_unitialized_node_across_cluster and all_states == {"waiting"}
+            ):
                 # All instance are off, reboot cluster from outage from the leader unit
 
                 logger.info("Attempting reboot from complete outage.")
                 try:
-                    self._mysql.reboot_from_complete_outage()
+                    # Need condition to avoid rebooting on all units of application
+                    if self.unit.is_leader() or only_single_unitialized_node_across_cluster:
+                        self._mysql.reboot_from_complete_outage()
                 except MySQLRebootFromCompleteOutageError:
                     logger.error("Failed to reboot cluster from complete outage.")
-                    self.unit.status = BlockedStatus("failed to recover cluster.")
+
+                if only_single_unitialized_node_across_cluster and all_states == {"waiting"}:
+                    self._mysql.drop_group_replication_metadata_schema()
+                    self.create_cluster()
+                    self.unit.status = ActiveStatus(self.active_status_message)
+                else:
+                    self.unit.status = BlockedStatus("failed to recover cluster.")
 
             return True
 
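
In the single-node case the handler now drops the leftover Group Replication metadata and recreates the cluster instead of blocking. `drop_group_replication_metadata_schema()` presumably wraps MySQL Shell's AdminAPI `dba.drop_metadata_schema()`; here is a sketch under that assumption (`_run_mysqlsh_script`, `instance_def`, and the connection user are assumed names):

```python
# Sketch only; the exact shell invocation and helper names are assumptions.
def drop_group_replication_metadata_schema(self) -> None:
    """Drop stale InnoDB Cluster metadata so create_cluster() can start fresh."""
    commands = (
        f"shell.connect('{self.instance_def(self.server_config_user)}')",
        "dba.drop_metadata_schema({'force': True})",
    )
    self._run_mysqlsh_script("\n".join(commands))
```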
@@ -785,10 +808,23 @@ def _is_cluster_blocked(self) -> bool:
         Returns: a boolean indicating whether the update-status (caller) should
             no-op and return.
         """
-        unit_member_state = self.unit_peer_data.get("member-state")
-        if unit_member_state in ["waiting", "restarting"]:
+        # We need to query member state from the server since member state would
+        # be 'offline' if pod rescheduled during cluster creation, however
+        # member-state in the unit peer databag will be 'waiting'
+        member_state_exists = True
+        try:
+            member_state, _ = self._mysql.get_member_state()
+        except MySQLUnableToGetMemberStateError:
+            logger.error("Error getting member state while checking if cluster is blocked")
+            self.unit.status = MaintenanceStatus("Unable to get member state")
+            return True
+        except MySQLNoMemberStateError:
+            member_state_exists = False
+
+        if not member_state_exists or member_state == "restarting":
             # avoid changing status while tls is being set up or charm is being initialized
-            logger.info(f"Unit state is {unit_member_state}")
+            logger.info("Unit is waiting or restarting")
+            logger.debug(f"{member_state_exists=}, {member_state=}")
             return True
 
         # avoid changing status while async replication is setting up
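
The check now trusts the state reported by the server over the `member-state` key in the peer databag, since the databag can still say "waiting" after a pod reschedule. The resulting gating logic, restated as a small pure function for clarity (not part of the charm; names mirror the diff above):

```python
# Pure restatement of the new gating logic, for readability only.
def should_skip_update_status(member_state: Optional[str], query_failed: bool) -> bool:
    if query_failed:
        # Could not query the server at all: report and back off
        return True
    if member_state is None or member_state == "restarting":
        # No state row yet (e.g. mid cluster-creation) or instance is restarting
        return True
    return False


# Examples: should_skip_update_status(None, False) -> True
#           should_skip_update_status("online", False) -> False
```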
@@ -812,6 +848,7 @@ def _on_update_status(self, _: Optional[UpdateStatusEvent]) -> None:
 
         container = self.unit.get_container(CONTAINER_NAME)
         if not container.can_connect():
+            logger.debug("Cannot connect to pebble in the mysql container")
             return
 
         if self._handle_potential_cluster_crash_scenario():