Skip to content

Commit 925ee32

Browse files
Handle upgrade pod already exists (#73)
Fixes: #72
1 parent 14d9900 commit 925ee32

File tree

3 files changed

+201
-94
lines changed

3 files changed

+201
-94
lines changed

controllers/reconcile.go

Lines changed: 94 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
kerrors "k8s.io/apimachinery/pkg/util/errors"
2424
"k8s.io/apimachinery/pkg/util/sets"
2525
"k8s.io/apiserver/pkg/storage/names"
26+
"k8s.io/utils/ptr"
2627
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2728
"sigs.k8s.io/cluster-api/controllers/external"
2829
"sigs.k8s.io/cluster-api/util"
@@ -141,8 +142,12 @@ func (r *MicroK8sControlPlaneReconciler) reconcileMachines(ctx context.Context,
141142
var oldVersion, newVersion string
142143

143144
if numMachines > 0 {
145+
var err error
144146
sort.Sort(SortByCreationTimestamp(machines))
145-
oldVersion = semver.MajorMinor(*machines[0].Spec.Version)
147+
oldVersion, err = getOldestVersion(machines)
148+
if err != nil {
149+
return ctrl.Result{}, fmt.Errorf("failed to get oldest version: %w", err)
150+
}
146151
newVersion = semver.MajorMinor(mcp.Spec.Version)
147152
}
148153

@@ -202,50 +207,51 @@ func (r *MicroK8sControlPlaneReconciler) reconcileMachines(ctx context.Context,
202207

203208
// For each machine, get the node and upgrade it
204209
for _, machine := range machines {
210+
if isMachineUpgraded(machine, newVersion) {
211+
logger.Info("Machine already upgraded", "machine", machine.Name, "version", newVersion)
212+
continue
213+
}
214+
215+
if machine.Status.NodeRef == nil {
216+
logger.Info("Machine does not have a nodeRef yet, requeueing...", "machine", machine.Name)
217+
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
218+
}
205219

206220
// Get the node for the machine
207221
node, err := kubeclient.CoreV1().Nodes().Get(ctx, machine.Status.NodeRef.Name, metav1.GetOptions{})
208222
if err != nil {
209-
return ctrl.Result{RequeueAfter: 20 * time.Second}, err
223+
return ctrl.Result{}, fmt.Errorf("failed to get node: %w", err)
210224
}
211225

212226
logger.Info(fmt.Sprintf("Creating upgrade pod on %s...", node.Name))
213227
pod, err := createUpgradePod(ctx, kubeclient, node.Name, mcp.Spec.Version)
214228
if err != nil {
215-
logger.Error(err, "Error creating upgrade pod.")
229+
return ctrl.Result{}, fmt.Errorf("failed to create upgrade pod: %w", err)
216230
}
217231

218-
logger.Info("Waiting for upgrade node to be updated to the given version...")
219-
err = waitForNodeUpgrade(ctx, kubeclient, node.Name, mcp.Spec.Version)
220-
if err != nil {
221-
logger.Error(err, "Error waiting for node upgrade.")
232+
logger.Info("Waiting for node to be updated to the given version...", "node", node.Name)
233+
if err := waitForNodeUpgrade(ctx, kubeclient, node.Name, mcp.Spec.Version); err != nil {
234+
return ctrl.Result{}, fmt.Errorf("failed to wait for node upgrade: %w", err)
222235
}
223236

224-
time.Sleep(10 * time.Second)
225-
226-
// Get the current machine
237+
logger.Info("Node upgraded successfully.", "node", node.Name)
238+
// Update the machine version
227239
currentMachine := &clusterv1.Machine{}
228240
currentMachineName := node.Annotations["cluster.x-k8s.io/machine"]
229-
err = r.Client.Get(ctx, client.ObjectKey{Namespace: cluster.Namespace, Name: currentMachineName}, currentMachine)
230-
if err != nil {
231-
logger.Error(err, "Error getting machine.")
241+
if err := r.Client.Get(ctx, client.ObjectKey{Namespace: cluster.Namespace, Name: currentMachineName}, currentMachine); err != nil {
242+
return ctrl.Result{}, fmt.Errorf("failed to get machine: %w", err)
232243
}
233244

234-
// Update the machine version
245+
logger.Info("Updating machine version...", "machine", currentMachine.Name)
235246
currentMachine.Spec.Version = &mcp.Spec.Version
236247
logger.Info(fmt.Sprintf("Now updating machine %s version to %s...", currentMachine.Name, *currentMachine.Spec.Version))
237-
err = r.Client.Update(ctx, currentMachine)
238-
if err != nil {
239-
logger.Error(err, "Could not update the machine version. We will retry.")
248+
if err := r.Client.Update(ctx, currentMachine); err != nil {
249+
return ctrl.Result{}, fmt.Errorf("failed to update machine: %w", err)
240250
}
241251

242-
time.Sleep(10 * time.Second)
243-
244-
// wait until pod is deleted
245252
logger.Info(fmt.Sprintf("Removing upgrade pod %s from %s...", pod.ObjectMeta.Name, node.Name))
246-
err = waitForPodDeletion(ctx, kubeclient, pod.ObjectMeta.Name)
247-
if err != nil {
248-
logger.Error(err, "Error waiting for pod deletion.")
253+
if err := waitForPodDeletion(ctx, kubeclient, pod.ObjectMeta.Name); err != nil {
254+
return ctrl.Result{}, fmt.Errorf("failed to wait for pod deletion: %w", err)
249255
}
250256

251257
logger.Info(fmt.Sprintf("Upgrade of node %s completed.\n", node.Name))
@@ -689,18 +695,25 @@ func (r *MicroK8sControlPlaneReconciler) removeNodeFromDqlite(ctx context.Contex
689695
return nil
690696
}
691697

698+
// createUpgradePod creates a pod that upgrades the node to the given version.
699+
// If the upgrade pod already exists, it is deleted and a new one will be created.
692700
func createUpgradePod(ctx context.Context, kubeclient *kubernetesClient, nodeName string, nodeVersion string) (*corev1.Pod, error) {
693-
nodeVersion = strings.TrimPrefix(semver.MajorMinor(nodeVersion), "v")
701+
podName := "upgrade-pod"
694702

695-
uid := int64(0)
696-
priv := true
703+
// delete the pod if it exists
704+
if err := waitForPodDeletion(ctx, kubeclient, podName); err != nil {
705+
return nil, fmt.Errorf("failed to delete pod %s: %w", podName, err)
706+
}
707+
708+
nodeVersion = strings.TrimPrefix(semver.MajorMinor(nodeVersion), "v")
697709

698710
pod := &corev1.Pod{
699711
ObjectMeta: metav1.ObjectMeta{
700-
Name: "upgrade-pod",
712+
Name: podName,
701713
},
702714
Spec: corev1.PodSpec{
703-
NodeName: nodeName,
715+
NodeName: nodeName,
716+
RestartPolicy: corev1.RestartPolicyOnFailure,
704717
Containers: []corev1.Container{
705718
{
706719
Name: "upgrade",
@@ -709,7 +722,7 @@ func createUpgradePod(ctx context.Context, kubeclient *kubernetesClient, nodeNam
709722
"su",
710723
"-c",
711724
},
712-
SecurityContext: &corev1.SecurityContext{Privileged: &priv, RunAsUser: &uid},
725+
SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true), RunAsUser: ptr.To(int64(0))},
713726
Args: []string{
714727
fmt.Sprintf("curl -X POST -H \"Content-Type: application/json\" --unix-socket /run/snapd.socket -d '{\"action\": \"refresh\",\"channel\":\"%s/stable\"}' http://localhost/v2/snaps/microk8s", nodeVersion),
715728
},
@@ -736,47 +749,77 @@ func createUpgradePod(ctx context.Context, kubeclient *kubernetesClient, nodeNam
736749

737750
pod, err := kubeclient.CoreV1().Pods("default").Create(ctx, pod, metav1.CreateOptions{})
738751
if err != nil {
739-
return nil, err
752+
return nil, fmt.Errorf("failed to create pod %s: %w", podName, err)
740753
}
741754

742755
return pod, nil
743756
}
744757

745758
func waitForNodeUpgrade(ctx context.Context, kubeclient *kubernetesClient, nodeName, nodeVersion string) error {
746-
// attempt to connect 60 times. With a wait of 10 secs this should be 600 sec = 10 min
747-
attempts := 60
748-
for attempts > 0 {
759+
for attempts := 100; attempts > 0; attempts-- {
749760
node, err := kubeclient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
750761
if err != nil {
751-
return err
762+
return fmt.Errorf("failed to get node %s: %w", nodeName, err)
752763
}
753764
currentVersion := semver.MajorMinor(node.Status.NodeInfo.KubeletVersion)
754765
nodeVersion = semver.MajorMinor(nodeVersion)
755766
if strings.HasPrefix(currentVersion, nodeVersion) {
756-
break
767+
return nil
757768
}
758-
time.Sleep(10 * time.Second)
759-
attempts--
769+
770+
time.Sleep(3 * time.Second)
760771
}
761-
return nil
772+
773+
return fmt.Errorf("timed out waiting for node %s to be upgraded to version %s", nodeName, nodeVersion)
762774
}
763775

776+
// waitForPodDeletion waits for the pod to be deleted. If the pod doesn't exist, it returns nil.
764777
func waitForPodDeletion(ctx context.Context, kubeclient *kubernetesClient, podName string) error {
765-
for {
766-
gracePeriod := int64(0)
778+
var err error
779+
for attempts := 5; attempts > 0; attempts-- {
767780
deleteOptions := metav1.DeleteOptions{
768-
GracePeriodSeconds: &gracePeriod,
781+
GracePeriodSeconds: ptr.To(int64(0)),
769782
}
770-
err := kubeclient.CoreV1().Pods("default").Delete(ctx, podName, deleteOptions)
771-
time.Sleep(10 * time.Second)
772-
if err != nil {
773-
if apierrors.IsNotFound(err) {
774-
break
775-
}
776-
return err
777-
} else {
778-
break
783+
err = kubeclient.CoreV1().Pods("default").Delete(ctx, podName, deleteOptions)
784+
if err == nil || apierrors.IsNotFound(err) {
785+
return nil
779786
}
787+
time.Sleep(3 * time.Second)
780788
}
781-
return nil
789+
790+
return fmt.Errorf("timed out waiting for pod %s to be deleted: %w", podName, err)
791+
}
792+
793+
// getOldestVersion returns the oldest version of the machines.
794+
func getOldestVersion(machines []clusterv1.Machine) (string, error) {
795+
var v string
796+
for _, m := range machines {
797+
if m.Spec.Version == nil {
798+
// weird!
799+
continue
800+
}
801+
802+
if v == "" {
803+
v = semver.MajorMinor(*m.Spec.Version)
804+
continue
805+
}
806+
807+
if semver.Compare(v, *m.Spec.Version) > 0 {
808+
v = semver.MajorMinor(*m.Spec.Version)
809+
}
810+
}
811+
812+
if v == "" {
813+
return "", fmt.Errorf("no version found")
814+
}
815+
return v, nil
816+
}
817+
818+
func isMachineUpgraded(m clusterv1.Machine, newVersion string) bool {
819+
if m.Spec.Version == nil {
820+
return false
821+
}
822+
machineVersion := semver.MajorMinor(*m.Spec.Version)
823+
newVersion = semver.MajorMinor(newVersion) // just being extra careful
824+
return semver.Compare(machineVersion, newVersion) == 0
782825
}

pkg/clusteragent/clusteragent.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,13 @@ func (c *Client) do(ctx context.Context, method, endpoint string, header http.He
147147

148148
// createPod creates a pod that runs a curl command.
149149
func (c *Client) createPod(ctx context.Context, method, endpoint string, header http.Header, data map[string]any) (*corev1.Pod, error) {
150+
podName := fmt.Sprintf(CallerPodNameFormat, c.nodeName)
151+
152+
// delete the pod if it exists
153+
if err := c.deletePod(ctx, podName); err != nil {
154+
return nil, fmt.Errorf("failed to delete pod: %w", err)
155+
}
156+
150157
curl, err := c.createCURLString(method, endpoint, header, data)
151158
if err != nil {
152159
return nil, fmt.Errorf("failed to create curl string: %w", err)
@@ -156,7 +163,7 @@ func (c *Client) createPod(ctx context.Context, method, endpoint string, header
156163

157164
pod := &corev1.Pod{
158165
ObjectMeta: metav1.ObjectMeta{
159-
Name: fmt.Sprintf(CallerPodNameFormat, c.nodeName),
166+
Name: podName,
160167
},
161168
Spec: corev1.PodSpec{
162169
NodeName: c.nodeName,
@@ -211,7 +218,7 @@ func (c *Client) createCURLString(method, endpoint string, header http.Header, d
211218
return req, nil
212219
}
213220

214-
// deletePod deletes a pod.
221+
// deletePod deletes a pod. It will succeed if the pod doesn't exist.
215222
func (c *Client) deletePod(ctx context.Context, podName string) error {
216223
deleteOptions := metav1.DeleteOptions{
217224
GracePeriodSeconds: ptr.To(int64(0)),

0 commit comments

Comments
 (0)