Skip to content

Commit 002f741

Browse files
committed
WIP maxRetries test
1 parent 88c40a7 commit 002f741

File tree

2 files changed

+150
-9
lines changed

2 files changed

+150
-9
lines changed

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -242,13 +242,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
242242
}
243243
log.Info("trigger pod recreation")
244244

245-
// Must persist the modification pod starts, and ensure we retry on conflict.
246-
// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
247-
// when the workspace stops, due to PodStarts still being 0 when the original Pod
248-
// disappears.
249-
// Use a Patch instead of an Update, to prevent conflicts.
250-
patch := client.MergeFrom(workspace.DeepCopy())
251-
252245
// Reset status
253246
sc := workspace.Status.DeepCopy()
254247
workspace.Status = workspacev1.WorkspaceStatus{}
@@ -258,8 +251,8 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
258251
workspace.Status.PodRecreated = sc.PodRecreated + 1
259252
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Recreating pod... (%d retry)", workspace.Status.PodRecreated), metav1.ConditionFalse))
260253

261-
if err := r.Status().Patch(ctx, workspace, patch); err != nil {
262-
log.Error(err, "Failed to patch workspace status-reset")
254+
if err := r.Status().Update(ctx, workspace); err != nil {
255+
log.Error(err, "Failed to update workspace status-reset")
263256
return ctrl.Result{}, err
264257
}
265258

components/ws-manager-mk2/controllers/workspace_controller_test.go

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,154 @@ var _ = Describe("WorkspaceController", func() {
538538
expectPhaseTransitions(su, []workspacev1.WorkspacePhase{workspacev1.WorkspacePhasePending, workspacev1.WorkspacePhaseCreating, workspacev1.WorkspacePhaseInitializing, workspacev1.WorkspacePhaseRunning})
539539
// ### validate end
540540
})
541+
542+
It("pod recreation is only tried until limit", func() {
543+
ws := newWorkspace(uuid.NewString(), "default")
544+
m := collectMetricCounts(wsMetrics, ws)
545+
su := collectSubscriberUpdates()
546+
547+
// ### prepare block start
548+
By("creating workspace")
549+
// Simulate pod getting scheduled to a node.
550+
var node corev1.Node
551+
node.Name = uuid.NewString()
552+
Expect(k8sClient.Create(ctx, &node)).To(Succeed())
553+
// Manually create the workspace pod with the node name.
554+
// We can't update the pod with the node name, as this operation
555+
// is only allowed for the scheduler. So as a hack, we manually
556+
// create the workspace's pod.
557+
pod := &corev1.Pod{
558+
ObjectMeta: metav1.ObjectMeta{
559+
Name: fmt.Sprintf("ws-%s", ws.Name),
560+
Namespace: ws.Namespace,
561+
Finalizers: []string{workspacev1.GitpodFinalizerName},
562+
Labels: map[string]string{
563+
wsk8s.WorkspaceManagedByLabel: constants.ManagedBy,
564+
},
565+
},
566+
Spec: corev1.PodSpec{
567+
NodeName: node.Name,
568+
Containers: []corev1.Container{{
569+
Name: "workspace",
570+
Image: "someimage",
571+
}},
572+
},
573+
}
574+
575+
Expect(k8sClient.Create(ctx, pod)).To(Succeed())
576+
pod = createWorkspaceExpectPod(ws)
577+
updateObjWithRetries(k8sClient, pod, false, func(pod *corev1.Pod) {
578+
Expect(ctrl.SetControllerReference(ws, pod, k8sClient.Scheme())).To(Succeed())
579+
})
580+
581+
podStarts := 1
582+
podRecreated := 0
583+
// mimic the regular "start" phase
584+
updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) {
585+
ws.Status.PodStarts = podStarts
586+
ws.Status.PodRecreated = podRecreated
587+
})
588+
589+
startAndRejectWorkspace := func() {
590+
591+
// Wait until controller has reconciled at least once (by waiting for the runtime status to get updated).
592+
// This is necessary for the metrics to get recorded correctly. If we don't wait, the first reconciliation
593+
// might be once the Pod is already in a running state, and hence the metric state might not record e.g. content
594+
// restore.
595+
// This is only necessary because we manually created the pod, normally the Pod creation is the controller's
596+
// first reconciliation which ensures the metrics are recorded from the workspace's initial state.
597+
598+
Eventually(func(g Gomega) {
599+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
600+
g.Expect(ws.Status.Runtime).ToNot(BeNil())
601+
g.Expect(ws.Status.Runtime.PodName).To(Equal(pod.Name))
602+
}, timeout, interval).Should(Succeed())
603+
604+
// Await "deployed" condition, and check we are good
605+
expectConditionEventually(ws, string(workspacev1.WorkspaceConditionDeployed), metav1.ConditionTrue, "")
606+
Eventually(func(g Gomega) {
607+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
608+
g.Expect(ws.Status.PodStarts).To(Equal(podStarts))
609+
g.Expect(ws.Status.PodRecreated).To(Equal(podRecreated))
610+
}, timeout, interval).Should(Succeed())
611+
612+
// ### prepare block end
613+
614+
// ### trigger block start
615+
// Make pod be rejected 🪄
616+
By("rejecting pod")
617+
rejectPod(pod)
618+
619+
By("await pod being in stopping")
620+
Eventually(func(g Gomega) {
621+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
622+
g.Expect(ws.Status.Phase).To(Equal(workspacev1.WorkspacePhaseStopping))
623+
}, timeout, interval).Should(Succeed())
624+
625+
// when a rejected workspace pod is in stopping, ws-daemon wipes the state before it's moved to "stopped"
626+
// mimic this ws-daemon behavior
627+
updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) {
628+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
629+
})
630+
631+
podStarts++
632+
podRecreated++
633+
634+
By("await pod recreation")
635+
Eventually(func(g Gomega) {
636+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
637+
g.Expect(ws.Status.PodRecreated).To(Equal(podRecreated))
638+
g.Expect(ws.Status.Phase).To(Equal(workspacev1.WorkspacePhasePending))
639+
}, timeout, interval).Should(Succeed())
640+
// ### trigger block end
641+
642+
// ### retry block start
643+
// Transition Pod to pending, and expect workspace to reach Creating phase.
644+
// This should also cause create time metrics to be recorded.
645+
updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) {
646+
pod.Status.Phase = corev1.PodPending
647+
pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
648+
State: corev1.ContainerState{
649+
Waiting: &corev1.ContainerStateWaiting{
650+
Reason: "ContainerCreating",
651+
},
652+
},
653+
Name: "workspace",
654+
}}
655+
})
656+
657+
expectPhaseEventually(ws, workspacev1.WorkspacePhaseCreating)
658+
// ### retry block end
659+
}
660+
661+
for j := 0; j < 4; j++ {
662+
startAndRejectWorkspace()
663+
}
664+
665+
expectWorkspaceCleanup(ws, pod)
666+
667+
// ### validate result
668+
Eventually(func(g Gomega) {
669+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
670+
g.Expect(ws.Status.PodStarts).To(Equal(4))
671+
g.Expect(ws.Status.PodRecreated).To(Equal(3))
672+
}, timeout, interval).Should(Succeed())
673+
expectConditionEventually(ws, string(workspacev1.WorkspaceConditionFailed), metav1.ConditionTrue, "")
674+
675+
expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
676+
restores: 1,
677+
backups: 0,
678+
backupFailures: 0,
679+
failures: 1,
680+
creatingCounts: 1,
681+
stops: map[StopReason]int{StopReasonStartFailure: 1},
682+
starts: 1, // this is NOT PodStarts, but merely an artifact of how we count it in the tests
683+
recreations: map[int]int{1: 1},
684+
})
685+
686+
expectPhaseTransitions(su, []workspacev1.WorkspacePhase{workspacev1.WorkspacePhasePending, workspacev1.WorkspacePhaseCreating, workspacev1.WorkspacePhaseInitializing, workspacev1.WorkspacePhaseStopping, workspacev1.WorkspacePhaseStopped})
687+
// ### validate end
688+
})
541689
})
542690

543691
Context("with headless workspaces", func() {

0 commit comments

Comments
 (0)