@@ -23,6 +23,7 @@ import (
2323 kerrors "k8s.io/apimachinery/pkg/util/errors"
2424 "k8s.io/apimachinery/pkg/util/sets"
2525 "k8s.io/apiserver/pkg/storage/names"
26+ "k8s.io/utils/ptr"
2627 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2728 "sigs.k8s.io/cluster-api/controllers/external"
2829 "sigs.k8s.io/cluster-api/util"
@@ -141,8 +142,12 @@ func (r *MicroK8sControlPlaneReconciler) reconcileMachines(ctx context.Context,
141142 var oldVersion , newVersion string
142143
143144 if numMachines > 0 {
145+ var err error
144146 sort .Sort (SortByCreationTimestamp (machines ))
145- oldVersion = semver .MajorMinor (* machines [0 ].Spec .Version )
147+ oldVersion , err = getOldestVersion (machines )
148+ if err != nil {
149+ return ctrl.Result {}, fmt .Errorf ("failed to get oldest version: %w" , err )
150+ }
146151 newVersion = semver .MajorMinor (mcp .Spec .Version )
147152 }
148153
@@ -202,50 +207,51 @@ func (r *MicroK8sControlPlaneReconciler) reconcileMachines(ctx context.Context,
202207
203208 // For each machine, get the node and upgrade it
204209 for _ , machine := range machines {
210+ if isMachineUpgraded (machine , newVersion ) {
211+ logger .Info ("Machine already upgraded" , "machine" , machine .Name , "version" , newVersion )
212+ continue
213+ }
214+
215+ if machine .Status .NodeRef == nil {
216+ logger .Info ("Machine does not have a nodeRef yet, requeueing..." , "machine" , machine .Name )
217+ return ctrl.Result {RequeueAfter : 10 * time .Second }, nil
218+ }
205219
206220 // Get the node for the machine
207221 node , err := kubeclient .CoreV1 ().Nodes ().Get (ctx , machine .Status .NodeRef .Name , metav1.GetOptions {})
208222 if err != nil {
209- return ctrl.Result {RequeueAfter : 20 * time . Second } , err
223+ return ctrl.Result {}, fmt . Errorf ( "failed to get node: %w" , err )
210224 }
211225
212226 logger .Info (fmt .Sprintf ("Creating upgrade pod on %s..." , node .Name ))
213227 pod , err := createUpgradePod (ctx , kubeclient , node .Name , mcp .Spec .Version )
214228 if err != nil {
215- logger . Error ( err , "Error creating upgrade pod." )
229+ return ctrl. Result {}, fmt . Errorf ( "failed to create upgrade pod: %w" , err )
216230 }
217231
218- logger .Info ("Waiting for upgrade node to be updated to the given version..." )
219- err = waitForNodeUpgrade (ctx , kubeclient , node .Name , mcp .Spec .Version )
220- if err != nil {
221- logger .Error (err , "Error waiting for node upgrade." )
232+ logger .Info ("Waiting for node to be updated to the given version..." , "node" , node .Name )
233+ if err := waitForNodeUpgrade (ctx , kubeclient , node .Name , mcp .Spec .Version ); err != nil {
234+ return ctrl.Result {}, fmt .Errorf ("failed to wait for node upgrade: %w" , err )
222235 }
223236
224- time .Sleep (10 * time .Second )
225-
226- // Get the current machine
237+ logger .Info ("Node upgraded successfully." , "node" , node .Name )
238+ // Update the machine version
227239 currentMachine := & clusterv1.Machine {}
228240 currentMachineName := node .Annotations ["cluster.x-k8s.io/machine" ]
229- err = r .Client .Get (ctx , client.ObjectKey {Namespace : cluster .Namespace , Name : currentMachineName }, currentMachine )
230- if err != nil {
231- logger .Error (err , "Error getting machine." )
241+ if err := r .Client .Get (ctx , client.ObjectKey {Namespace : cluster .Namespace , Name : currentMachineName }, currentMachine ); err != nil {
242+ return ctrl.Result {}, fmt .Errorf ("failed to get machine: %w" , err )
232243 }
233244
234- // Update the machine version
245+ logger . Info ( "Updating machine version..." , " machine" , currentMachine . Name )
235246 currentMachine .Spec .Version = & mcp .Spec .Version
236247 logger .Info (fmt .Sprintf ("Now updating machine %s version to %s..." , currentMachine .Name , * currentMachine .Spec .Version ))
237- err = r .Client .Update (ctx , currentMachine )
238- if err != nil {
239- logger .Error (err , "Could not update the machine version. We will retry." )
248+ if err := r .Client .Update (ctx , currentMachine ); err != nil {
249+ return ctrl.Result {}, fmt .Errorf ("failed to update machine: %w" , err )
240250 }
241251
242- time .Sleep (10 * time .Second )
243-
244- // wait until pod is deleted
245252 logger .Info (fmt .Sprintf ("Removing upgrade pod %s from %s..." , pod .ObjectMeta .Name , node .Name ))
246- err = waitForPodDeletion (ctx , kubeclient , pod .ObjectMeta .Name )
247- if err != nil {
248- logger .Error (err , "Error waiting for pod deletion." )
253+ if err := waitForPodDeletion (ctx , kubeclient , pod .ObjectMeta .Name ); err != nil {
254+ return ctrl.Result {}, fmt .Errorf ("failed to wait for pod deletion: %w" , err )
249255 }
250256
251257 logger .Info (fmt .Sprintf ("Upgrade of node %s completed.\n " , node .Name ))
@@ -689,18 +695,25 @@ func (r *MicroK8sControlPlaneReconciler) removeNodeFromDqlite(ctx context.Contex
689695 return nil
690696}
691697
698+ // createUpgradePod creates a pod that upgrades the node to the given version.
699+ // If the upgrade pod already exists, it is deleted and a new one will be created.
692700func createUpgradePod (ctx context.Context , kubeclient * kubernetesClient , nodeName string , nodeVersion string ) (* corev1.Pod , error ) {
693- nodeVersion = strings . TrimPrefix ( semver . MajorMinor ( nodeVersion ), "v" )
701+ podName := "upgrade-pod"
694702
695- uid := int64 (0 )
696- priv := true
703+ // delete the pod if it exists
704+ if err := waitForPodDeletion (ctx , kubeclient , podName ); err != nil {
705+ return nil , fmt .Errorf ("failed to delete pod %s: %w" , podName , err )
706+ }
707+
708+ nodeVersion = strings .TrimPrefix (semver .MajorMinor (nodeVersion ), "v" )
697709
698710 pod := & corev1.Pod {
699711 ObjectMeta : metav1.ObjectMeta {
700- Name : "upgrade-pod" ,
712+ Name : podName ,
701713 },
702714 Spec : corev1.PodSpec {
703- NodeName : nodeName ,
715+ NodeName : nodeName ,
716+ RestartPolicy : corev1 .RestartPolicyOnFailure ,
704717 Containers : []corev1.Container {
705718 {
706719 Name : "upgrade" ,
@@ -709,7 +722,7 @@ func createUpgradePod(ctx context.Context, kubeclient *kubernetesClient, nodeNam
709722 "su" ,
710723 "-c" ,
711724 },
712- SecurityContext : & corev1.SecurityContext {Privileged : & priv , RunAsUser : & uid },
725+ SecurityContext : & corev1.SecurityContext {Privileged : ptr . To ( true ) , RunAsUser : ptr . To ( int64 ( 0 )) },
713726 Args : []string {
714727 fmt .Sprintf ("curl -X POST -H \" Content-Type: application/json\" --unix-socket /run/snapd.socket -d '{\" action\" : \" refresh\" ,\" channel\" :\" %s/stable\" }' http://localhost/v2/snaps/microk8s" , nodeVersion ),
715728 },
@@ -736,47 +749,77 @@ func createUpgradePod(ctx context.Context, kubeclient *kubernetesClient, nodeNam
736749
737750 pod , err := kubeclient .CoreV1 ().Pods ("default" ).Create (ctx , pod , metav1.CreateOptions {})
738751 if err != nil {
739- return nil , err
752+ return nil , fmt . Errorf ( "failed to create pod %s: %w" , podName , err )
740753 }
741754
742755 return pod , nil
743756}
744757
745758func waitForNodeUpgrade (ctx context.Context , kubeclient * kubernetesClient , nodeName , nodeVersion string ) error {
746- // attempt to connect 60 times. With a wait of 10 secs this should be 600 sec = 10 min
747- attempts := 60
748- for attempts > 0 {
759+ for attempts := 100 ; attempts > 0 ; attempts -- {
749760 node , err := kubeclient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
750761 if err != nil {
751- return err
762+ return fmt . Errorf ( "failed to get node %s: %w" , nodeName , err )
752763 }
753764 currentVersion := semver .MajorMinor (node .Status .NodeInfo .KubeletVersion )
754765 nodeVersion = semver .MajorMinor (nodeVersion )
755766 if strings .HasPrefix (currentVersion , nodeVersion ) {
756- break
767+ return nil
757768 }
758- time . Sleep ( 10 * time . Second )
759- attempts --
769+
770+ time . Sleep ( 3 * time . Second )
760771 }
761- return nil
772+
773+ return fmt .Errorf ("timed out waiting for node %s to be upgraded to version %s" , nodeName , nodeVersion )
762774}
763775
776+ // waitForPodDeletion waits for the pod to be deleted. If the pod doesn't exist, it returns nil.
764777func waitForPodDeletion (ctx context.Context , kubeclient * kubernetesClient , podName string ) error {
765- for {
766- gracePeriod := int64 ( 0 )
778+ var err error
779+ for attempts := 5 ; attempts > 0 ; attempts -- {
767780 deleteOptions := metav1.DeleteOptions {
768- GracePeriodSeconds : & gracePeriod ,
781+ GracePeriodSeconds : ptr . To ( int64 ( 0 )) ,
769782 }
770- err := kubeclient .CoreV1 ().Pods ("default" ).Delete (ctx , podName , deleteOptions )
771- time .Sleep (10 * time .Second )
772- if err != nil {
773- if apierrors .IsNotFound (err ) {
774- break
775- }
776- return err
777- } else {
778- break
783+ err = kubeclient .CoreV1 ().Pods ("default" ).Delete (ctx , podName , deleteOptions )
784+ if err == nil || apierrors .IsNotFound (err ) {
785+ return nil
779786 }
787+ time .Sleep (3 * time .Second )
780788 }
781- return nil
789+
790+ return fmt .Errorf ("timed out waiting for pod %s to be deleted: %w" , podName , err )
791+ }
792+
793+ // getOldestVersion returns the oldest version of the machines.
794+ func getOldestVersion (machines []clusterv1.Machine ) (string , error ) {
795+ var v string
796+ for _ , m := range machines {
797+ if m .Spec .Version == nil {
798+ // weird!
799+ continue
800+ }
801+
802+ if v == "" {
803+ v = semver .MajorMinor (* m .Spec .Version )
804+ continue
805+ }
806+
807+ if semver .Compare (v , * m .Spec .Version ) > 0 {
808+ v = semver .MajorMinor (* m .Spec .Version )
809+ }
810+ }
811+
812+ if v == "" {
813+ return "" , fmt .Errorf ("no version found" )
814+ }
815+ return v , nil
816+ }
817+
818+ func isMachineUpgraded (m clusterv1.Machine , newVersion string ) bool {
819+ if m .Spec .Version == nil {
820+ return false
821+ }
822+ machineVersion := semver .MajorMinor (* m .Spec .Version )
823+ newVersion = semver .MajorMinor (newVersion ) // just being extra careful
824+ return semver .Compare (machineVersion , newVersion ) == 0
782825}
0 commit comments