Skip to content

Commit 055f35e

Browse files
committed
[ws-daemon, ws-manager] Make sure to cleanup all workspace state for rejected pods
1 parent ac1c86b commit 055f35e

File tree

7 files changed

+137
-8
lines changed

7 files changed

+137
-8
lines changed

components/ws-daemon/pkg/controller/mock.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/ws-daemon/pkg/controller/workspace_controller.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,45 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
227227
span, ctx := opentracing.StartSpanFromContext(ctx, "handleWorkspaceStop")
228228
defer tracing.FinishSpan(span, &err)
229229

230+
if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
231+
setStateWipedCondition := func(s bool) {
232+
err := retry.RetryOnConflict(retryParams, func() error {
233+
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
234+
return err
235+
}
236+
237+
if s {
238+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
239+
} else {
240+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionFalse))
241+
}
242+
return wsc.Client.Status().Update(ctx, ws)
243+
})
244+
if err != nil {
245+
log.Error(err, "failed to set StateWiped condition")
246+
}
247+
}
248+
// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
249+
log.Info("handling workspace stop - wiping mode")
250+
err = wsc.operations.WipeWorkspace(ctx, ws.Name)
251+
if err != nil {
252+
setStateWipedCondition(false)
253+
wsc.emitEvent(ws, "Wiping", fmt.Errorf("failed to wipe workspace: %w", err))
254+
return ctrl.Result{}, fmt.Errorf("failed to wipe workspace: %w", err)
255+
}
256+
257+
setStateWipedCondition(true)
258+
259+
return ctrl.Result{}, nil
260+
}
261+
262+
// regular case
263+
return wsc.doWorkspaceContentBackup(ctx, span, ws, req)
264+
}
265+
266+
func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, span opentracing.Span, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {
267+
log := log.FromContext(ctx)
268+
230269
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c == nil || c.Status == metav1.ConditionFalse {
231270
return ctrl.Result{}, fmt.Errorf("workspace content was never ready")
232271
}

components/ws-daemon/pkg/controller/workspace_operations.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ type WorkspaceOperations interface {
6666
BackupWorkspace(ctx context.Context, opts BackupOptions) (*csapi.GitStatus, error)
6767
// DeleteWorkspace deletes the content of the workspace from disk
6868
DeleteWorkspace(ctx context.Context, instanceID string) error
69+
// WipeWorkspace deletes all references to the workspace. Does not fail if parts are already gone, or state is incosistent.
70+
WipeWorkspace(ctx context.Context, instanceID string) error
6971
// SnapshotIDs generates the name and url for a snapshot
7072
SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error)
7173
// Snapshot takes a snapshot of the workspace
@@ -285,6 +287,36 @@ func (wso *DefaultWorkspaceOperations) DeleteWorkspace(ctx context.Context, inst
285287
return nil
286288
}
287289

290+
func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instanceID string) error {
291+
ws, err := wso.provider.GetAndConnect(ctx, instanceID)
292+
if err != nil {
293+
// we have to assume everything is fine, and this workspace has already been completely wiped
294+
return nil
295+
}
296+
297+
if err = ws.Dispose(ctx, wso.provider.hooks[session.WorkspaceDisposed]); err != nil {
298+
glog.WithError(err).WithFields(ws.OWI()).Error("cannot dispose session")
299+
return err
300+
}
301+
302+
// remove workspace daemon directory in the node
303+
if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {
304+
glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon directory")
305+
return err
306+
}
307+
308+
// remove workspace daemon node directory in the node
309+
// TODO(gpl): Is this used at all? Can't find any reference
310+
if err := os.RemoveAll(ws.ServiceLocNode); err != nil {
311+
glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon node directory")
312+
return err
313+
}
314+
315+
wso.provider.Remove(ctx, instanceID)
316+
317+
return nil
318+
}
319+
288320
func (wso *DefaultWorkspaceOperations) SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error) {
289321
sess, err := wso.provider.GetAndConnect(ctx, instanceID)
290322
if err != nil {

components/ws-manager-api/go/crd/v1/workspace_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,9 @@ const (
267267

268268
// WorkspaceConditionPodRejected is true if we detected that the pod was rejected by the node
269269
WorkspaceConditionPodRejected WorkspaceCondition = "PodRejected"
270+
271+
// WorkspaceConditionStateWiped is true once all state has successfully been wiped by ws-daemon. This is only set if PodRejected=true, and the rejected workspace has been deleted.
272+
WorkspaceConditionStateWiped WorkspaceCondition = "StateWiped"
270273
)
271274

272275
func NewWorkspaceConditionDeployed() metav1.Condition {
@@ -304,6 +307,15 @@ func NewWorkspaceConditionPodRejected(message string, status metav1.ConditionSta
304307
}
305308
}
306309

310+
func NewWorkspaceConditionStateWiped(message string, status metav1.ConditionStatus) metav1.Condition {
311+
return metav1.Condition{
312+
Type: string(WorkspaceConditionStateWiped),
313+
LastTransitionTime: metav1.Now(),
314+
Status: status,
315+
Message: message,
316+
}
317+
}
318+
307319
func NewWorkspaceConditionTimeout(message string) metav1.Condition {
308320
return metav1.Condition{
309321
Type: string(WorkspaceConditionTimeout),
@@ -498,6 +510,14 @@ func (w *Workspace) IsConditionTrue(condition WorkspaceCondition) bool {
498510
return wsk8s.ConditionPresentAndTrue(w.Status.Conditions, string(condition))
499511
}
500512

513+
func (w *Workspace) GetConditionState(condition WorkspaceCondition) (state metav1.ConditionStatus, ok bool) {
514+
cond := wsk8s.GetCondition(w.Status.Conditions, string(condition))
515+
if cond == nil {
516+
return "", false
517+
}
518+
return cond.Status, true
519+
}
520+
501521
// UpsertConditionOnStatusChange calls SetCondition if the condition does not exist or it's status or message has changed.
502522
func (w *Workspace) UpsertConditionOnStatusChange(newCondition metav1.Condition) {
503523
oldCondition := wsk8s.GetCondition(w.Status.Conditions, newCondition.Type)

components/ws-manager-mk2/controllers/status.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,15 @@ func (r *WorkspaceReconciler) checkNodeDisappeared(ctx context.Context, workspac
284284
}
285285

286286
func isDisposalFinished(ws *workspacev1.Workspace) bool {
287+
if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
288+
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionStateWiped)); c != nil {
289+
// If the condition is set, we are done with the disposal
290+
return true
291+
}
292+
// If the condition has not yet been set, we are not done, yet.
293+
return false
294+
}
295+
287296
return ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete) ||
288297
ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure) ||
289298
ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted) ||
@@ -323,6 +332,17 @@ func (r *WorkspaceReconciler) extractFailure(ctx context.Context, ws *workspacev
323332
return msg, nil
324333
}
325334

335+
// Check for state wiping failure.
336+
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionStateWiped)); c != nil && c.Status == metav1.ConditionFalse {
337+
msg := c.Message
338+
if msg == "" {
339+
msg = "Wiping workspace state failed for an unknown reason"
340+
} else {
341+
msg = fmt.Sprintf("Wiping workspace state failed: %s", msg)
342+
}
343+
return msg, nil
344+
}
345+
326346
status := pod.Status
327347
if status.Phase == corev1.PodFailed && (status.Reason != "" || status.Message != "") {
328348
// Don't force the phase to UNKNONWN here to leave a chance that we may detect the actual phase of

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
224224
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
225225
return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
226226
}
227+
log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")
227228

228229
// Must persist the modification pod starts, and ensure we retry on conflict.
229230
// If we fail to persist this value, it's possible that the Pod gets recreated endlessly

components/ws-manager-mk2/controllers/workspace_controller_test.go

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -461,14 +461,17 @@ var _ = Describe("WorkspaceController", func() {
461461
By("rejecting pod")
462462
rejectPod(pod)
463463

464-
// TODO(gpl): how to check for transient states like:
465-
// - pod deletion
466-
// - PodRejected condition
467-
// By("await pod deleted")
468-
// expectWorkspaceCleanup(ws, pod)
469-
// Eventually(func(g Gomega) {
470-
// g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, pod)).To(MatchError(ContainSubstring("not found")))
471-
// }, timeout, interval).Should(Succeed())
464+
By("await pod being in stopping")
465+
Eventually(func(g Gomega) {
466+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
467+
g.Expect(ws.Status.Phase).To(Equal(workspacev1.WorkspacePhaseStopping))
468+
}, timeout, interval).Should(Succeed())
469+
470+
// when a rejected workspace pod is in stopping, ws-daemon wipes the state before it's moved to "stopped"
471+
// mimic this ws-daemon behavior
472+
updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) {
473+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
474+
})
472475

473476
By("await pod recreation")
474477
Eventually(func(g Gomega) {

0 commit comments

Comments
 (0)