@@ -19,6 +19,7 @@ package controllers
1919import (
2020 "bytes"
2121 "context"
22+ "encoding/json"
2223 "fmt"
2324 "sort"
2425 "strconv"
@@ -94,20 +95,31 @@ func GetLog(ctx context.Context, controller string) logr.Logger {
9495//
9596
9697// findBestCandidate returns the node with the lowest seqno
97- func findBestCandidate (status * mariadbv1.GaleraStatus ) string {
98- sortednodes := maps .Keys (status .Attributes )
98+ func findBestCandidate (g * mariadbv1.Galera ) ( node string , found bool ) {
99+ sortednodes := maps .Keys (g . Status .Attributes )
99100 sort .Strings (sortednodes )
100101 bestnode := ""
101102 bestseqno := - 1
102103 for _ , node := range sortednodes {
103- seqno := status .Attributes [node ].Seqno
104+ // On clean shutdown, galera sets the last
105+ // stopped node as 'safe to bootstrap', so use
106+ // this hint when we can
107+ if g .Status .Attributes [node ].SafeToBootstrap {
108+ return node , true
109+ }
110+ seqno := g .Status .Attributes [node ].Seqno
104111 intseqno , _ := strconv .Atoi (seqno )
105112 if intseqno >= bestseqno {
106113 bestnode = node
107114 bestseqno = intseqno
108115 }
109116 }
110- return bestnode //"galera-0"
117+ // if we pass here, a candidate is only valid if we
118+ // inspected all the expected replicas (e.g. typically 3)
119+ if len (g .Status .Attributes ) != int (* g .Spec .Replicas ) {
120+ return "" , false
121+ }
122+ return bestnode , true //"galera-0"
111123}
112124
113125// buildGcommURI builds a gcomm URI for a galera instance
@@ -240,18 +252,22 @@ func injectGcommURI(ctx context.Context, h *helper.Helper, config *rest.Config,
240252}
241253
242254// retrieveSequenceNumber probes a pod's galera instance for sequence number
243- func retrieveSequenceNumber (ctx context.Context , helper * helper.Helper , config * rest.Config , instance * mariadbv1.Galera , pod * corev1.Pod ) error {
244- err := mariadb .ExecInPod (ctx , helper , config , instance .Namespace , pod .Name , "galera" ,
255+ func retrieveSequenceNumber (ctx context.Context , helper * helper.Helper , config * rest.Config , instance * mariadbv1.Galera , pod * corev1.Pod ) (errStr []string , err error ) {
256+ errStr = nil
257+ err = mariadb .ExecInPod (ctx , helper , config , instance .Namespace , pod .Name , "galera" ,
245258 []string {"/bin/bash" , "/var/lib/operator-scripts/detect_last_commit.sh" },
246- func (stdout * bytes.Buffer , _ * bytes.Buffer ) error {
247- seqno := strings .TrimSuffix (stdout .String (), "\n " )
248- attr := mariadbv1.GaleraAttributes {
249- Seqno : seqno ,
259+ func (stdout * bytes.Buffer , stderr * bytes.Buffer ) error {
260+ var attr mariadbv1.GaleraAttributes
261+ if err := json .Unmarshal (stdout .Bytes (), & attr ); err != nil {
262+ return err
263+ }
264+ if stderr .Len () > 0 {
265+ errStr = strings .Split (strings .TrimSuffix (stderr .String (), "\n " ), "\n " )
250266 }
251267 instance .Status .Attributes [pod .Name ] = attr
252268 return nil
253269 })
254- return err
270+ return
255271}
256272
257273// clearPodAttributes clears information known by the operator about a pod
@@ -737,7 +753,7 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
737753 for _ , pod := range getReadyPods (podList .Items ) {
738754 name := pod .Name
739755 if _ , found := instance .Status .Attributes [name ]; found {
740- log .Info ("Galera started on " , "pod" , pod . Name )
756+ log .Info ("Galera started" , "pod" , name )
741757 clearPodAttributes (instance , name )
742758 }
743759 }
@@ -777,21 +793,36 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
777793 // . any other status means the pod is starting/restarting. We can't
778794 // exec into the pod yet, so we will probe it in another reconcile loop.
779795 if ! instance .Status .Bootstrapped && ! isBootstrapInProgress (instance ) {
796+ var node string
797+ found := false
780798 for _ , pod := range getRunningPodsMissingAttributes (ctx , podList .Items , instance , helper , r .config ) {
781799 name := pod .Name
782800 util .LogForObject (helper , fmt .Sprintf ("Pod %s running, retrieve seqno" , name ), instance )
783- err := retrieveSequenceNumber (ctx , helper , r .config , instance , & pod )
801+ warn , err := retrieveSequenceNumber (ctx , helper , r .config , instance , & pod )
802+ if len (warn ) > 0 {
803+ util .LogForObject (helper , fmt .Sprintf ("Warning: %q" , warn ), instance )
804+ }
784805 if err != nil {
785- log .Error (err , "Failed to retrieve seqno for " , " name" , name )
806+ log .Error (err , fmt . Sprintf ( "Failed to retrieve seqno for %s " , name ) )
786807 return ctrl.Result {}, err
787808 }
788- log .Info ("" , "Pod" , name , "seqno:" , instance .Status .Attributes [name ].Seqno )
809+ log .Info (fmt .Sprintf ("Attributes retrieved for %s" , name ),
810+ "UUID" , instance .Status .Attributes [name ].UUID ,
811+ "Seqno" , instance .Status .Attributes [name ].Seqno ,
812+ "SafeToBootstrap" , instance .Status .Attributes [name ].SafeToBootstrap ,
813+ )
814+ if instance .Status .Attributes [name ].SafeToBootstrap {
815+ node = name
816+ found = true
817+ break
818+ }
789819 }
790820
791821 // Check if we have enough info to bootstrap the cluster now
792- if (len (instance .Status .Attributes ) > 0 ) &&
793- (len (instance .Status .Attributes ) == len (podList .Items )) {
794- node := findBestCandidate (& instance .Status )
822+ if ! found {
823+ node , found = findBestCandidate (instance )
824+ }
825+ if found {
795826 pod := getPodFromName (podList .Items , node )
796827 log .Info ("Pushing gcomm URI to bootstrap" , "pod" , node )
797828 // Setting the gcomm attribute marks this pod as 'currently bootstrapping the cluster'
0 commit comments