Skip to content

Commit db692ec

Browse files
committed
[ws-daemon, ws-manager] Fix podRecreationTimeout, and synchronization of state-wiping
1 parent 055f35e commit db692ec

File tree

5 files changed

+62
-16
lines changed

5 files changed

+62
-16
lines changed

components/ws-daemon/pkg/controller/workspace_controller.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,23 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
228228
defer tracing.FinishSpan(span, &err)
229229

230230
if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
231+
if ws.IsConditionPresent(workspacev1.WorkspaceConditionStateWiped) {
232+
// we are done here
233+
return ctrl.Result{}, nil
234+
}
235+
236+
// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
237+
if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
238+
// Container is still running, we need to wait for it to stop.
239+
// We should get an event when the condition changes, but requeue
240+
// anyways to make sure we act on it in time.
241+
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
242+
}
243+
244+
if wsc.latestWorkspace(ctx, ws) != nil {
245+
return ctrl.Result{Requeue: true, RequeueAfter: 100 * time.Millisecond}, nil
246+
}
247+
231248
setStateWipedCondition := func(s bool) {
232249
err := retry.RetryOnConflict(retryParams, func() error {
233250
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
@@ -245,8 +262,8 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
245262
log.Error(err, "failed to set StateWiped condition")
246263
}
247264
}
248-
// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
249265
log.Info("handling workspace stop - wiping mode")
266+
250267
err = wsc.operations.WipeWorkspace(ctx, ws.Name)
251268
if err != nil {
252269
setStateWipedCondition(false)
@@ -256,6 +273,7 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
256273

257274
setStateWipedCondition(true)
258275

276+
log.Info("handling workspace stop - wiping done.")
259277
return ctrl.Result{}, nil
260278
}
261279

components/ws-daemon/pkg/controller/workspace_operations.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -305,13 +305,6 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
305305
return err
306306
}
307307

308-
// remove workspace daemon node directory in the node
309-
// TODO(gpl): Is this used at all? Can't find any reference
310-
if err := os.RemoveAll(ws.ServiceLocNode); err != nil {
311-
glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon node directory")
312-
return err
313-
}
314-
315308
wso.provider.Remove(ctx, instanceID)
316309

317310
return nil

components/ws-manager-api/go/crd/v1/workspace_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,11 @@ func (w *Workspace) IsConditionTrue(condition WorkspaceCondition) bool {
510510
return wsk8s.ConditionPresentAndTrue(w.Status.Conditions, string(condition))
511511
}
512512

513+
func (w *Workspace) IsConditionPresent(condition WorkspaceCondition) bool {
514+
c := wsk8s.GetCondition(w.Status.Conditions, string(condition))
515+
return c != nil
516+
}
517+
513518
func (w *Workspace) GetConditionState(condition WorkspaceCondition) (state metav1.ConditionStatus, ok bool) {
514519
cond := wsk8s.GetCondition(w.Status.Conditions, string(condition))
515520
if cond == nil {

components/ws-manager-mk2/controllers/status.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ const (
4343

4444
// podRejectedReasonOutOfCPU is the value of pod.status.Reason in case the pod got rejected by kubelet because of insufficient CPU available
4545
podRejectedReasonOutOfCPU = "OutOfcpu"
46+
47+
// podRejectedReasonOutOfMemory is the value of pod.status.Reason in case the pod got rejected by kubelet because of insufficient memory available
48+
podRejectedReasonOutOfMemory = "OutOfmemory"
4649
)
4750

4851
func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspace *workspacev1.Workspace, pods *corev1.PodList, cfg *config.Configuration) (err error) {
@@ -129,9 +132,9 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
129132
workspace.Status.Phase = *phase
130133
}
131134

132-
if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
135+
if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
133136
// Check: A situation where we want to retry?
134-
if pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == podRejectedReasonNodeAffinity || pod.Status.Reason == podRejectedReasonOutOfCPU) && strings.HasPrefix(pod.Status.Message, "Pod was rejected") {
137+
if isPodRejected(pod) {
135138
// This is a situation where we want to re-create the pod!
136139
log.Info("workspace scheduling failed", "workspace", workspace.Name, "reason", failure)
137140
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue))
@@ -490,3 +493,8 @@ func isPodBeingDeleted(pod *corev1.Pod) bool {
490493
func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
491494
return ws.ObjectMeta.DeletionTimestamp != nil
492495
}
496+
497+
// isPodRejected returns true if the pod has been rejected by the kubelet
498+
func isPodRejected(pod *corev1.Pod) bool {
499+
return pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == podRejectedReasonNodeAffinity || pod.Status.Reason == podRejectedReasonOutOfCPU || pod.Status.Reason == podRejectedReasonOutOfMemory) && strings.HasPrefix(pod.Status.Message, "Pod was rejected")
500+
}

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
138138
}
139139

140140
if !equality.Semantic.DeepDerivative(oldStatus, workspace.Status) {
141-
log.Info("updating workspace status", "status", workspace.Status, "podStatus", podStatus)
141+
log.Info("updating workspace status", "status", workspace.Status, "podStatus", podStatus, "pods", len(workspacePods.Items))
142142
}
143143

144144
err = r.Status().Update(ctx, &workspace)
@@ -181,6 +181,24 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
181181
// if there isn't a workspace pod and we're not currently deleting this workspace,// create one.
182182
switch {
183183
case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1:
184+
if workspace.Status.PodRecreated > 0 {
185+
// This is a re-creation: Make sure to wait at least for
186+
c := wsk8s.GetCondition(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionPodRejected))
187+
if c == nil {
188+
err = fmt.Errorf("failed to retrieve PodRejected condition")
189+
log.Error(err, "failed to trigger pod recreation")
190+
return ctrl.Result{}, err
191+
}
192+
193+
recreationTimeout := r.podRecreationTimeout()
194+
waitTime := time.Until(c.LastTransitionTime.Add(recreationTimeout))
195+
if waitTime > 0 {
196+
log.WithValues("waitTime", waitTime).Info("waiting for pod recreation timeout")
197+
return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, nil
198+
}
199+
log.WithValues("waitedTime", waitTime.Abs().String()).Info("waited for pod recreation timeout")
200+
}
201+
184202
sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
185203
if err != nil {
186204
log.Error(err, "unable to create startWorkspace context")
@@ -250,12 +268,8 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
250268
// Reset metrics cache
251269
r.metrics.forgetWorkspace(workspace)
252270

253-
requeueAfter := 5 * time.Second
254-
if r.Config.PodRecreationBackoff != 0 {
255-
requeueAfter = time.Duration(r.Config.PodRecreationBackoff)
256-
}
257-
258271
r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "")
272+
requeueAfter := r.podRecreationTimeout()
259273
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
260274

261275
case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
@@ -362,6 +376,14 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
362376
return ctrl.Result{}, nil
363377
}
364378

379+
func (r *WorkspaceReconciler) podRecreationTimeout() time.Duration {
380+
recreationTimeout := 5 * time.Second
381+
if r.Config.PodRecreationBackoff != 0 {
382+
recreationTimeout = time.Duration(r.Config.PodRecreationBackoff)
383+
}
384+
return recreationTimeout
385+
}
386+
365387
func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *workspacev1.Workspace) {
366388
log := log.FromContext(ctx)
367389

0 commit comments

Comments
 (0)