[ws-daemon, ws-manager] Fix podRecreationTimeout, and synchronization of state-wiping

geropl · geropl · commit db692ecb5f65 · 2024-10-29T09:33:59.000Z
diff --git a/components/ws-daemon/pkg/controller/workspace_controller.go b/components/ws-daemon/pkg/controller/workspace_controller.go
@@ -228,6 +228,23 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
 	defer tracing.FinishSpan(span, &err)
 
 	if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
+		if ws.IsConditionPresent(workspacev1.WorkspaceConditionStateWiped) {
+			// we are done here
+			return ctrl.Result{}, nil
+		}
+
+		// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
+		if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
+			// Container is still running, we need to wait for it to stop.
+			// We should get an event when the condition changes, but requeue
+			// anyways to make sure we act on it in time.
+			return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
+		}
+
+		if wsc.latestWorkspace(ctx, ws) != nil {
+			return ctrl.Result{Requeue: true, RequeueAfter: 100 * time.Millisecond}, nil
+		}
+
 		setStateWipedCondition := func(s bool) {
 			err := retry.RetryOnConflict(retryParams, func() error {
 				if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
@@ -245,8 +262,8 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
 				log.Error(err, "failed to set StateWiped condition")
 			}
 		}
-		// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
 		log.Info("handling workspace stop - wiping mode")
+
 		err = wsc.operations.WipeWorkspace(ctx, ws.Name)
 		if err != nil {
 			setStateWipedCondition(false)
@@ -256,6 +273,7 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
 
 		setStateWipedCondition(true)
 
+		log.Info("handling workspace stop - wiping done.")
 		return ctrl.Result{}, nil
 	}
 
diff --git a/components/ws-daemon/pkg/controller/workspace_operations.go b/components/ws-daemon/pkg/controller/workspace_operations.go
@@ -305,13 +305,6 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
 		return err
 	}
 
-	// remove workspace daemon node directory in the node
-	// TODO(gpl): Is this used at all? Can't find any reference
-	if err := os.RemoveAll(ws.ServiceLocNode); err != nil {
-		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon node directory")
-		return err
-	}
-
 	wso.provider.Remove(ctx, instanceID)
 
 	return nil
diff --git a/components/ws-manager-api/go/crd/v1/workspace_types.go b/components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -510,6 +510,11 @@ func (w *Workspace) IsConditionTrue(condition WorkspaceCondition) bool {
 	return wsk8s.ConditionPresentAndTrue(w.Status.Conditions, string(condition))
 }
 
+func (w *Workspace) IsConditionPresent(condition WorkspaceCondition) bool {
+	c := wsk8s.GetCondition(w.Status.Conditions, string(condition))
+	return c != nil
+}
+
 func (w *Workspace) GetConditionState(condition WorkspaceCondition) (state metav1.ConditionStatus, ok bool) {
 	cond := wsk8s.GetCondition(w.Status.Conditions, string(condition))
 	if cond == nil {
diff --git a/components/ws-manager-mk2/controllers/status.go b/components/ws-manager-mk2/controllers/status.go
@@ -43,6 +43,9 @@ const (
 
 	// podRejectedReasonOutOfCPU is the value of pod.status.Reason in case the pod got rejected by kubelet because of insufficient CPU available
 	podRejectedReasonOutOfCPU = "OutOfcpu"
+
+	// podRejectedReasonOutOfMemory is the value of pod.status.Reason in case the pod got rejected by kubelet because of insufficient memory available
+	podRejectedReasonOutOfMemory = "OutOfmemory"
 )
 
 func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspace *workspacev1.Workspace, pods *corev1.PodList, cfg *config.Configuration) (err error) {
@@ -129,9 +132,9 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
 		workspace.Status.Phase = *phase
 	}
 
-	if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
+	if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
 		// Check: A situation where we want to retry?
-		if pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == podRejectedReasonNodeAffinity || pod.Status.Reason == podRejectedReasonOutOfCPU) && strings.HasPrefix(pod.Status.Message, "Pod was rejected") {
+		if isPodRejected(pod) {
 			// This is a situation where we want to re-create the pod!
 			log.Info("workspace scheduling failed", "workspace", workspace.Name, "reason", failure)
 			workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue))
@@ -490,3 +493,8 @@ func isPodBeingDeleted(pod *corev1.Pod) bool {
 func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
 	return ws.ObjectMeta.DeletionTimestamp != nil
 }
+
+// isPodRejected reports whether the pod is in the Failed phase because the kubelet rejected it (node affinity, out of CPU, or out of memory)
+func isPodRejected(pod *corev1.Pod) bool {
+	return pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == podRejectedReasonNodeAffinity || pod.Status.Reason == podRejectedReasonOutOfCPU || pod.Status.Reason == podRejectedReasonOutOfMemory) && strings.HasPrefix(pod.Status.Message, "Pod was rejected")
+}
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -138,7 +138,7 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	}
 
 	if !equality.Semantic.DeepDerivative(oldStatus, workspace.Status) {
-		log.Info("updating workspace status", "status", workspace.Status, "podStatus", podStatus)
+		log.Info("updating workspace status", "status", workspace.Status, "podStatus", podStatus, "pods", len(workspacePods.Items))
 	}
 
 	err = r.Status().Update(ctx, &workspace)
@@ -181,6 +181,24 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 		// if there isn't a workspace pod and we're not currently deleting this workspace,// create one.
 		switch {
 		case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1:
+			if workspace.Status.PodRecreated > 0 {
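+				// This is a re-creation: make sure to wait at least for the pod recreation timeout (counted from the PodRejected condition's last transition) before creating the new pod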
+				c := wsk8s.GetCondition(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionPodRejected))
+				if c == nil {
+					err = fmt.Errorf("failed to retrieve PodRejected condition")
+					log.Error(err, "failed to trigger pod recreation")
+					return ctrl.Result{}, err
+				}
+
+				recreationTimeout := r.podRecreationTimeout()
+				waitTime := time.Until(c.LastTransitionTime.Add(recreationTimeout))
+				if waitTime > 0 {
+					log.WithValues("waitTime", waitTime).Info("waiting for pod recreation timeout")
+					return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, nil
+				}
+				log.WithValues("waitedTime", waitTime.Abs().String()).Info("waited for pod recreation timeout")
+			}
+
 			sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
 			if err != nil {
 				log.Error(err, "unable to create startWorkspace context")
@@ -250,12 +268,8 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 			// Reset metrics cache
 			r.metrics.forgetWorkspace(workspace)
 
-			requeueAfter := 5 * time.Second
-			if r.Config.PodRecreationBackoff != 0 {
-				requeueAfter = time.Duration(r.Config.PodRecreationBackoff)
-			}
-
 			r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "")
+			requeueAfter := r.podRecreationTimeout()
 			return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
 
 		case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
@@ -362,6 +376,14 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 	return ctrl.Result{}, nil
 }
 
+func (r *WorkspaceReconciler) podRecreationTimeout() time.Duration {
+	recreationTimeout := 5 * time.Second
+	if r.Config.PodRecreationBackoff != 0 {
+		recreationTimeout = time.Duration(r.Config.PodRecreationBackoff)
+	}
+	return recreationTimeout
+}
+
 func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *workspacev1.Workspace) {
 	log := log.FromContext(ctx)