Skip to content

Commit 72f87e0

Browse files
committed
[ws-daemon, ws-manager] Review comments, logging cleanups and ordering fix
1 parent 68830be commit 72f87e0

File tree

14 files changed

+81
-84
lines changed

14 files changed

+81
-84
lines changed

components/ws-daemon/pkg/cgroup/cgroup.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package cgroup
66

77
import (
88
"context"
9+
"errors"
910

1011
"github.com/gitpod-io/gitpod/common-go/cgroups"
1112
"github.com/gitpod-io/gitpod/common-go/log"
@@ -81,7 +82,7 @@ func (host *PluginHost) WorkspaceAdded(ctx context.Context, ws *dispatch.Workspa
8182

8283
cgroupPath, err := disp.Runtime.ContainerCGroupPath(ctx, ws.ContainerID)
8384
if err != nil {
84-
if err == context.Canceled {
85+
if errors.Is(err, context.Canceled) {
8586
return nil
8687
}
8788
return xerrors.Errorf("cannot get cgroup path for container %s: %w", ws.ContainerID, err)

components/ws-daemon/pkg/container/containerd.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,17 +437,18 @@ func (s *Containerd) WaitForContainerStop(ctx context.Context, workspaceInstance
437437
func (s *Containerd) DisposeContainer(ctx context.Context, workspaceInstanceID string) {
438438
log := log.WithContext(ctx)
439439

440-
log.Debug("CONTAINERD: DISPOSING CONTAINER")
441-
defer log.Debug("CONTAINERD: DISPOSING CONTAINER DONE")
440+
log.Debug("containerd: disposing container")
442441

443442
s.cond.L.Lock()
444443
defer s.cond.L.Unlock()
445444

446445
info, ok := s.wsiIdx[workspaceInstanceID]
447446
if !ok {
448447
// seems we are already done here
448+
log.Debug("containerd: disposing container skipped")
449449
return
450450
}
451+
defer log.Debug("containerd: disposing container done")
451452

452453
if info.ID != "" {
453454
err := s.Client.ContainerService().Delete(ctx, info.ID)

components/ws-daemon/pkg/content/hooks.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ func hookWipingTeardown() session.WorkspaceLivecycleHook {
176176

177177
if !ws.DoWipe {
178178
// this is the "default" case for 99% of all workspaces
179+
// TODO(gpl): We should probably make this the default for all workspaces - but not with this PR
179180
return nil
180181
}
181182

components/ws-daemon/pkg/controller/workspace_controller.go

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -258,62 +258,68 @@ func (wsc *WorkspaceController) handleWorkspaceRunning(ctx context.Context, ws *
258258
}
259259

260260
func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {
261-
log := log.FromContext(ctx)
262261
span, ctx := opentracing.StartSpanFromContext(ctx, "handleWorkspaceStop")
263262
defer tracing.FinishSpan(span, &err)
264263

265264
if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
265+
// edge case only exercised for rejected workspace pods
266266
if ws.IsConditionPresent(workspacev1.WorkspaceConditionStateWiped) {
267267
// we are done here
268268
return ctrl.Result{}, nil
269269
}
270270

271-
// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
272-
if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
273-
// Container is still running, we need to wait for it to stop.
274-
// We should get an event when the condition changes, but requeue
275-
// anyways to make sure we act on it in time.
276-
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
277-
}
271+
return wsc.doWipeWorkspace(ctx, ws, req)
272+
}
278273

279-
if wsc.latestWorkspace(ctx, ws) != nil {
280-
return ctrl.Result{Requeue: true, RequeueAfter: 100 * time.Millisecond}, nil
281-
}
274+
// regular case
275+
return wsc.doWorkspaceContentBackup(ctx, span, ws, req)
276+
}
282277

283-
setStateWipedCondition := func(s bool) {
284-
err := retry.RetryOnConflict(retryParams, func() error {
285-
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
286-
return err
287-
}
278+
func (wsc *WorkspaceController) doWipeWorkspace(ctx context.Context, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {
279+
log := log.FromContext(ctx)
288280

289-
if s {
290-
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
291-
} else {
292-
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionFalse))
293-
}
294-
return wsc.Client.Status().Update(ctx, ws)
295-
})
296-
if err != nil {
297-
log.Error(err, "failed to set StateWiped condition")
281+
// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
282+
if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
283+
// Container is still running, we need to wait for it to stop.
284+
// We should get an event when the condition changes, but requeue
285+
// anyways to make sure we act on it in time.
286+
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
287+
}
288+
289+
if wsc.latestWorkspace(ctx, ws) != nil {
290+
return ctrl.Result{Requeue: true, RequeueAfter: 100 * time.Millisecond}, nil
291+
}
292+
293+
setStateWipedCondition := func(success bool) {
294+
err := retry.RetryOnConflict(retryParams, func() error {
295+
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
296+
return err
298297
}
299-
}
300-
log.Info("handling workspace stop - wiping mode")
301298

302-
err = wsc.operations.WipeWorkspace(ctx, ws.Name)
299+
if success {
300+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
301+
} else {
302+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionFalse))
303+
}
304+
return wsc.Client.Status().Update(ctx, ws)
305+
})
303306
if err != nil {
304-
setStateWipedCondition(false)
305-
wsc.emitEvent(ws, "Wiping", fmt.Errorf("failed to wipe workspace: %w", err))
306-
return ctrl.Result{}, fmt.Errorf("failed to wipe workspace: %w", err)
307+
log.Error(err, "failed to set StateWiped condition")
307308
}
309+
}
310+
log.Info("handling workspace stop - wiping mode")
311+
defer log.Info("handling workspace stop - wiping done.")
308312

309-
setStateWipedCondition(true)
310-
311-
log.Info("handling workspace stop - wiping done.")
312-
return ctrl.Result{}, nil
313+
err = wsc.operations.WipeWorkspace(ctx, ws.Name)
314+
if err != nil {
315+
setStateWipedCondition(false)
316+
wsc.emitEvent(ws, "Wiping", fmt.Errorf("failed to wipe workspace: %w", err))
317+
return ctrl.Result{}, fmt.Errorf("failed to wipe workspace: %w", err)
313318
}
314319

315-
// regular case
316-
return wsc.doWorkspaceContentBackup(ctx, span, ws, req)
320+
setStateWipedCondition(true)
321+
322+
return ctrl.Result{}, nil
317323
}
318324

319325
func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, span opentracing.Span, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {

components/ws-daemon/pkg/controller/workspace_operations.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -320,9 +320,6 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
320320
wso.dispatch.DisposeWorkspace(ctx, instanceID)
321321

322322
// remove workspace daemon directory in the node
323-
log.Debug("DELETING WORKSPACE DAEMON DIR")
324-
defer log.Debug("DELETING WORKSPACE DAEMON DIR DONE")
325-
326323
removedChan := make(chan struct{}, 1)
327324
go func() {
328325
defer close(removedChan)
@@ -332,9 +329,11 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
332329
}
333330
}()
334331

335-
timeoutT := time.NewTicker(10 * time.Second)
332+
// We never want the "RemoveAll" to block the workspace from being delete, so we'll resort to make this a best-effort approach, and time out after 10s.
333+
timeout := time.NewTicker(10 * time.Second)
334+
defer timeout.Stop()
336335
select {
337-
case <-timeoutT.C:
336+
case <-timeout.C:
338337
case <-removedChan:
339338
log.Debug("successfully removed workspace daemon directory")
340339
}

components/ws-daemon/pkg/cpulimit/dispatch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ func (d *DispatchListener) WorkspaceAdded(ctx context.Context, ws *dispatch.Work
181181

182182
cgroupPath, err := disp.Runtime.ContainerCGroupPath(ctx, ws.ContainerID)
183183
if err != nil {
184-
if dispatch.IsCancelled(ctx) {
184+
if errors.Is(err, context.Canceled) {
185185
return nil
186186
}
187187
return xerrors.Errorf("cannot start governer: %w", err)

components/ws-daemon/pkg/daemon/markunmount.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"bufio"
99
"bytes"
1010
"context"
11+
"errors"
1112
"io/ioutil"
1213
"path/filepath"
1314
"strings"
@@ -127,7 +128,7 @@ func (c *MarkUnmountFallback) WorkspaceUpdated(ctx context.Context, ws *dispatch
127128
}
128129

129130
err := unmountMark(ws.InstanceID)
130-
if err != nil && !dispatch.IsCancelled(ctx) {
131+
if err != nil && errors.Is(err, context.Canceled) {
131132
log.WithFields(ws.OWI()).WithError(err).Error("cannot unmount mark mount from within ws-daemon")
132133
c.activityCounter.WithLabelValues("false").Inc()
133134
} else {

components/ws-daemon/pkg/diskguard/guard.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ type Guard struct {
7171
// Start starts the disk guard
7272
func (g *Guard) Start() {
7373
t := time.NewTicker(g.Interval)
74+
defer t.Stop()
7475
for {
7576
bvail, err := getAvailableBytes(g.Path)
7677
if err != nil {

components/ws-daemon/pkg/dispatch/dispatch.go

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -105,20 +105,16 @@ func GetFromContext(ctx context.Context) *Dispatch {
105105
return ctx.Value(contextDispatch).(*Dispatch)
106106
}
107107

108-
type dispacthHandlerWaitGroupKey struct{}
108+
type dispatchHandlerWaitGroupKey struct{}
109109

110110
var (
111-
contextDispatchWaitGroup = dispacthHandlerWaitGroupKey{}
111+
contextDispatchWaitGroup = dispatchHandlerWaitGroupKey{}
112112
)
113113

114114
func GetDispatchWaitGroup(ctx context.Context) *sync.WaitGroup {
115115
return ctx.Value(contextDispatchWaitGroup).(*sync.WaitGroup)
116116
}
117117

118-
func IsCancelled(ctx context.Context) bool {
119-
return context.Cause(ctx) != nil
120-
}
121-
122118
// Start starts the dispatch
123119
func (d *Dispatch) Start() error {
124120
ifac := informers.NewSharedInformerFactoryWithOptions(d.Kubernetes, podInformerResyncInterval, informers.WithNamespace(d.KubernetesNamespace))
@@ -195,8 +191,11 @@ func (d *Dispatch) DisposeWorkspace(ctx context.Context, instanceID string) {
195191
d.mu.Lock()
196192
defer d.mu.Unlock()
197193

198-
log.WithField("instanceID", instanceID).Debug("WS DISPOSE")
199-
defer log.WithField("instanceID", instanceID).Debug("WS DISPOSE DONE")
194+
log.WithField("instanceID", instanceID).Debug("disposing workspace")
195+
defer log.WithField("instanceID", instanceID).Debug("disposing workspace done")
196+
197+
// Make the runtome drop all state it might still have about this workspace
198+
d.Runtime.DisposeContainer(ctx, instanceID)
200199

201200
// If we have that instanceID present, cancel it's context
202201
state, present := d.ctxs[instanceID]
@@ -210,9 +209,6 @@ func (d *Dispatch) DisposeWorkspace(ctx context.Context, instanceID string) {
210209
// ...and wait for all long-running/async processes/go-routines to finish
211210
state.HandlerWaitGroup.Wait()
212211

213-
// Make the runtome drop all state it might still have about this workspace
214-
d.Runtime.DisposeContainer(ctx, instanceID)
215-
216212
// Mark as disposed, so we do not handle any further updates for it (except deletion)
217213
d.disposedCtxs[disposedKey(instanceID, state.Workspace.Pod)] = struct{}{}
218214

@@ -237,10 +233,9 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
237233
}
238234
disposedKey := disposedKey(workspaceInstanceID, newPod)
239235
if _, alreadyDisposed := d.disposedCtxs[disposedKey]; alreadyDisposed {
240-
log.WithField("disposedKey", disposedKey).Debug("DROPPING POD UPDATE FOR DISPOSED POD")
236+
log.WithField("disposedKey", disposedKey).Debug("dropping pod update for disposed pod")
241237
return
242238
}
243-
log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE: %s", workspaceInstanceID)
244239

245240
d.mu.Lock()
246241
defer d.mu.Unlock()
@@ -337,15 +332,15 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
337332
}
338333
}()
339334
}
340-
log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE DONE: %s", workspaceInstanceID)
341335
}
342336

343337
func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
344338
instanceID, ok := pod.Labels[wsk8s.WorkspaceIDLabel]
345339
if !ok {
346340
return
347341
}
348-
log.WithField("instanceID", instanceID).Debugf("POD DELETED: %s", instanceID)
342+
log.WithField("instanceID", instanceID).Debug("pod deleted")
343+
defer log.WithField("instanceID", instanceID).Debug("pod deleted done")
349344

350345
d.mu.Lock()
351346
defer d.mu.Unlock()
@@ -361,5 +356,4 @@ func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
361356

362357
delete(d.ctxs, instanceID)
363358

364-
log.WithField("instanceID", instanceID).Debugf("POD DELETED DONE: %s", instanceID)
365359
}

components/ws-daemon/pkg/iws/iws.go

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -961,9 +961,6 @@ func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.Tear
961961
owi := wbs.Session.OWI()
962962
log := log.WithFields(owi)
963963

964-
log.Debug("TEARDOWN")
965-
defer log.Debug("TEARDOWN DONE")
966-
967964
var (
968965
success = true
969966
err error
@@ -981,8 +978,9 @@ func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.Tear
981978
// WipingTeardown tears down every state we created using IWS
982979
func (wbs *InWorkspaceServiceServer) WipingTeardown(ctx context.Context, req *api.WipingTeardownRequest) (*api.WipingTeardownResponse, error) {
983980
log := log.WithFields(wbs.Session.OWI())
984-
log.WithField("doWipe", req.DoWipe).Debug("WIPING TEARDOWN")
985-
defer log.WithField("doWipe", req.DoWipe).Debug("WIPING TEARDOWN DONE")
981+
log.WithField("doWipe", req.DoWipe).Debug("iws.WipingTeardown")
982+
defer log.WithField("doWipe", req.DoWipe).Debug("iws.WipingTeardown done")
983+
986984
if !req.DoWipe {
987985
return &api.WipingTeardownResponse{Success: true}, nil
988986
}

0 commit comments

Comments
 (0)