@@ -19,6 +19,7 @@ package controllers
1919import (
2020 "bytes"
2121 "context"
22+ "encoding/json"
2223 "fmt"
2324 "sort"
2425 "strconv"
@@ -94,20 +95,31 @@ func GetLog(ctx context.Context, controller string) logr.Logger {
9495//
9596
9697// findBestCandidate returns the node with the lowest seqno
97- func findBestCandidate (status * mariadbv1.GaleraStatus ) string {
98- sortednodes := maps .Keys (status .Attributes )
98+ func findBestCandidate (g * mariadbv1.Galera ) ( node string , found bool ) {
99+ sortednodes := maps .Keys (g . Status .Attributes )
99100 sort .Strings (sortednodes )
100101 bestnode := ""
101102 bestseqno := - 1
102103 for _ , node := range sortednodes {
103- seqno := status .Attributes [node ].Seqno
104+ // On clean shutdown, galera sets the last
105+ // stopped node as 'safe to bootstrap', so use
106+ // this hint when we can
107+ if g .Status .Attributes [node ].SafeToBootstrap {
108+ return node , true
109+ }
110+ seqno := g .Status .Attributes [node ].Seqno
104111 intseqno , _ := strconv .Atoi (seqno )
105112 if intseqno >= bestseqno {
106113 bestnode = node
107114 bestseqno = intseqno
108115 }
109116 }
110- return bestnode //"galera-0"
117+ // if we pass here, a candidate is only valid if we
118+ // inspected all the expected replicas (e.g. typically 3)
119+ if len (g .Status .Attributes ) != int (* g .Spec .Replicas ) {
120+ return "" , false
121+ }
122+ return bestnode , true //"galera-0"
111123}
112124
113125// buildGcommURI builds a gcomm URI for a galera instance
@@ -230,18 +242,22 @@ func injectGcommURI(ctx context.Context, h *helper.Helper, config *rest.Config,
230242}
231243
232244// retrieveSequenceNumber probes a pod's galera instance for sequence number
233- func retrieveSequenceNumber (ctx context.Context , helper * helper.Helper , config * rest.Config , instance * mariadbv1.Galera , pod * corev1.Pod ) error {
234- err := mariadb .ExecInPod (ctx , helper , config , instance .Namespace , pod .Name , "galera" ,
245+ func retrieveSequenceNumber (ctx context.Context , helper * helper.Helper , config * rest.Config , instance * mariadbv1.Galera , pod * corev1.Pod ) (errStr []string , err error ) {
246+ errStr = nil
247+ err = mariadb .ExecInPod (ctx , helper , config , instance .Namespace , pod .Name , "galera" ,
235248 []string {"/bin/bash" , "/var/lib/operator-scripts/detect_last_commit.sh" },
236- func (stdout * bytes.Buffer , _ * bytes.Buffer ) error {
237- seqno := strings .TrimSuffix (stdout .String (), "\n " )
238- attr := mariadbv1.GaleraAttributes {
239- Seqno : seqno ,
249+ func (stdout * bytes.Buffer , stderr * bytes.Buffer ) error {
250+ var attr mariadbv1.GaleraAttributes
251+ if err := json .Unmarshal (stdout .Bytes (), & attr ); err != nil {
252+ return err
253+ }
254+ if stderr .Len () > 0 {
255+ errStr = strings .Split (strings .TrimSuffix (stderr .String (), "\n " ), "\n " )
240256 }
241257 instance .Status .Attributes [pod .Name ] = attr
242258 return nil
243259 })
244- return err
260+ return
245261}
246262
247263// clearPodAttributes clears information known by the operator about a pod
@@ -753,7 +769,7 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
753769 for _ , pod := range getReadyPods (podList .Items ) {
754770 name := pod .Name
755771 if _ , found := instance .Status .Attributes [name ]; found {
756- log .Info ("Galera started on " , "pod" , pod . Name )
772+ log .Info ("Galera started" , "pod" , name )
757773 clearPodAttributes (instance , name )
758774 }
759775 }
@@ -793,21 +809,36 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
793809 // . any other status means the pod is starting/restarting. We can't
794810 // exec into the pod yet, so we will probe it in another reconcile loop.
795811 if ! instance .Status .Bootstrapped && ! isBootstrapInProgress (instance ) {
812+ var node string
813+ found := false
796814 for _ , pod := range getRunningPodsMissingAttributes (ctx , podList .Items , instance , helper , r .config ) {
797815 name := pod .Name
798816 util .LogForObject (helper , fmt .Sprintf ("Pod %s running, retrieve seqno" , name ), instance )
799- err := retrieveSequenceNumber (ctx , helper , r .config , instance , & pod )
817+ warn , err := retrieveSequenceNumber (ctx , helper , r .config , instance , & pod )
818+ if len (warn ) > 0 {
819+ util .LogForObject (helper , fmt .Sprintf ("Warning: %q" , warn ), instance )
820+ }
800821 if err != nil {
801- log .Error (err , "Failed to retrieve seqno for " , " name" , name )
822+ log .Error (err , fmt . Sprintf ( "Failed to retrieve seqno for %s " , name ) )
802823 return ctrl.Result {}, err
803824 }
804- log .Info ("" , "Pod" , name , "seqno:" , instance .Status .Attributes [name ].Seqno )
825+ log .Info (fmt .Sprintf ("Attributes retrieved for %s" , name ),
826+ "UUID" , instance .Status .Attributes [name ].UUID ,
827+ "Seqno" , instance .Status .Attributes [name ].Seqno ,
828+ "SafeToBootstrap" , instance .Status .Attributes [name ].SafeToBootstrap ,
829+ )
830+ if instance .Status .Attributes [name ].SafeToBootstrap {
831+ node = name
832+ found = true
833+ break
834+ }
805835 }
806836
807837 // Check if we have enough info to bootstrap the cluster now
808- if (len (instance .Status .Attributes ) > 0 ) &&
809- (len (instance .Status .Attributes ) == len (podList .Items )) {
810- node := findBestCandidate (& instance .Status )
838+ if ! found {
839+ node , found = findBestCandidate (instance )
840+ }
841+ if found {
811842 pod := getPodFromName (podList .Items , node )
812843 log .Info ("Pushing gcomm URI to bootstrap" , "pod" , node )
813844 // Setting the gcomm attribute marks this pod as 'currently bootstrapping the cluster'
0 commit comments