Skip to content

Commit 9d10b88

Browse files
committed
WIP: Fix podRecreationTimeout + containerd cleanup
1 parent 7395836 commit 9d10b88

File tree

9 files changed

+89
-34
lines changed

9 files changed

+89
-34
lines changed

components/ws-daemon/pkg/container/container.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ type Runtime interface {
4747

4848
// IsContainerdReady returns the status of containerd.
4949
IsContainerdReady(ctx context.Context) (bool, error)
50+
51+
// DisposeContainer removes a stopped container, and everything we know about it
52+
DisposeContainer(ctx context.Context, workspaceInstanceID string)
5053
}
5154

5255
var (

components/ws-daemon/pkg/container/containerd.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,21 @@ func (s *Containerd) WaitForContainerStop(ctx context.Context, workspaceInstance
427427
}
428428
}
429429

430+
func (s *Containerd) DisposeContainer(ctx context.Context, workspaceInstanceID string) {
431+
s.cond.L.Lock()
432+
defer s.cond.L.Unlock()
433+
434+
info, ok := s.wsiIdx[workspaceInstanceID]
435+
if !ok {
436+
// seems we are already done here
437+
return
438+
}
439+
440+
delete(s.wsiIdx, info.InstanceID)
441+
delete(s.podIdx, info.PodName)
442+
delete(s.cntIdx, info.ID)
443+
}
444+
430445
// ContainerExists finds out if a container with the given ID exists.
431446
func (s *Containerd) ContainerExists(ctx context.Context, id ID) (exists bool, err error) {
432447
_, err = s.Client.ContainerService().Get(ctx, string(id))

components/ws-daemon/pkg/controller/workspace_operations.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -304,15 +304,15 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
304304
return err
305305
}
306306

307+
// dispose all running "dispatch handlers", e.g. all code running on the "pod informer"-triggered part of ws-daemon
308+
wso.dispatch.DisposeWorkspace(ctx, instanceID)
309+
307310
// remove workspace daemon directory in the node
308311
if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {
309312
glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon directory")
310313
return err
311314
}
312315

313-
// dispose all running "dispatch handlers", e.g. all code running on the "pod informer"-triggered part of ws-daemon
314-
wso.dispatch.DisposeWorkspace(ctx, instanceID)
315-
316316
// remove the reference from the WorkspaceProvider, e.g. the "workspace controller" part of ws-daemon
317317
wso.provider.Remove(ctx, instanceID)
318318

components/ws-daemon/pkg/dispatch/dispatch.go

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package dispatch
77
import (
88
"context"
99
"errors"
10+
"fmt"
1011
"sync"
1112
"time"
1213

@@ -59,7 +60,8 @@ func NewDispatch(runtime container.Runtime, kubernetes kubernetes.Interface, k8s
5960
Listener: listener,
6061
NodeName: nodename,
6162

62-
ctxs: make(map[string]*workspaceState),
63+
ctxs: make(map[string]*workspaceState),
64+
disposedCtxs: make(map[string]struct{}),
6365
}
6466

6567
return d, nil
@@ -76,9 +78,10 @@ type Dispatch struct {
7678

7779
Listener []Listener
7880

79-
stopchan chan struct{}
80-
ctxs map[string]*workspaceState
81-
mu sync.Mutex
81+
stopchan chan struct{}
82+
ctxs map[string]*workspaceState
83+
disposedCtxs map[string]struct{}
84+
mu sync.Mutex
8285
}
8386

8487
type workspaceState struct {
@@ -187,7 +190,7 @@ func (d *Dispatch) WorkspaceExistsOnNode(instanceID string) (ok bool) {
187190
return
188191
}
189192

190-
// DisposeWorkspace makes sure
193+
// DisposeWorkspace disposes the workspace incl. all running handler code for that pod
191194
func (d *Dispatch) DisposeWorkspace(ctx context.Context, instanceID string) {
192195
d.mu.Lock()
193196
defer d.mu.Unlock()
@@ -206,10 +209,26 @@ func (d *Dispatch) DisposeWorkspace(ctx context.Context, instanceID string) {
206209
// ...and wait for all long-running/async processes/go-routines to finish
207210
state.HandlerWaitGroup.Wait()
208211

212+
// Make sure the container is stopped
213+
err := d.Runtime.WaitForContainerStop(ctx, instanceID)
214+
if err != nil && !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
215+
log.WithError(err).WithFields(log.WorkspaceInstanceID(instanceID)).Error("unexpected error waiting for container to stop")
216+
}
217+
218+
// Make the runtime drop all state it might still have about this workspace
219+
d.Runtime.DisposeContainer(ctx, instanceID)
220+
221+
// Mark as disposed, so we do not handle any further updates for it (except deletion)
222+
d.disposedCtxs[disposedKey(instanceID, state.Workspace.Pod)] = struct{}{}
223+
209224
delete(d.ctxs, instanceID)
210225
log.WithField("instanceID", instanceID).Debugf("WS DISPOSE DONE: %s", instanceID)
211226
}
212227

228+
func disposedKey(instanceID string, pod *corev1.Pod) string {
229+
return fmt.Sprintf("%s-%s", instanceID, pod.CreationTimestamp.String())
230+
}
231+
213232
func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
214233
workspaceID, ok := newPod.Labels[wsk8s.MetaIDLabel]
215234
if !ok {
@@ -222,6 +241,11 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
222241
if d.NodeName != "" && newPod.Spec.NodeName != d.NodeName {
223242
return
224243
}
244+
disposedKey := disposedKey(workspaceInstanceID, newPod)
245+
if _, alreadyDisposed := d.disposedCtxs[disposedKey]; alreadyDisposed {
246+
log.WithField("disposedKey", disposedKey).Debug("DROPPING POD UPDATE FOR DISPOSED POD")
247+
return
248+
}
225249
log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE: %s", workspaceInstanceID)
226250

227251
d.mu.Lock()
@@ -340,6 +364,8 @@ func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
340364
if state.Cancel != nil {
341365
state.Cancel()
342366
}
367+
343368
delete(d.ctxs, instanceID)
369+
344370
log.WithField("instanceID", instanceID).Debugf("POD DELETED DONE: %s", instanceID)
345371
}

components/ws-manager-api/go/crd/v1/workspace_types.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,11 @@ func (ps PortSpec) Equal(other PortSpec) bool {
170170

171171
// WorkspaceStatus defines the observed state of Workspace
172172
type WorkspaceStatus struct {
173-
PodStarts int `json:"podStarts"`
174-
PodRecreated int `json:"podRecreated"`
175-
URL string `json:"url,omitempty" scrub:"redact"`
176-
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
173+
PodStarts int `json:"podStarts"`
174+
PodRecreated int `json:"podRecreated"`
175+
PodDeletionTime *metav1.Time `json:"podDeletionTime,omitempty"`
176+
URL string `json:"url,omitempty" scrub:"redact"`
177+
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
177178

178179
// +kubebuilder:default=Unknown
179180
Phase WorkspacePhase `json:"phase,omitempty"`

components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,9 @@ spec:
545545
type: integer
546546
podRecreated:
547547
type: integer
548+
podDeletionTime:
549+
format: date-time
550+
type: string
548551
runtime:
549552
properties:
550553
hostIP:

components/ws-manager-mk2/controllers/status.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"encoding/json"
1111
"fmt"
1212
"strings"
13+
"time"
1314

1415
wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
1516
"github.com/gitpod-io/gitpod/common-go/tracing"
@@ -71,6 +72,14 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
7172
workspace.Status.Phase = workspacev1.WorkspacePhaseStopped
7273
}
7374

75+
log.WithValues("podDeletionTime", workspace.Status.PodDeletionTime).Info("PodDeletionTimeValue")
76+
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped && workspace.Status.PodDeletionTime == nil {
77+
// Set the timestamp when we first saw the pod as deleted.
78+
// This is used for delaying eventual pod restarts
79+
podDeletionTime := metav1.NewTime(time.Now())
80+
workspace.Status.PodDeletionTime = &podDeletionTime
81+
}
82+
7483
workspace.UpsertConditionOnStatusChange(workspacev1.NewWorkspaceConditionContainerRunning(metav1.ConditionFalse))
7584
return nil
7685
case 1:

components/ws-manager-mk2/controllers/suite_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,9 @@ func newTestConfig() config.Configuration {
159159
Name: "default",
160160
},
161161
},
162-
WorkspaceURLTemplate: "{{ .ID }}-{{ .Prefix }}-{{ .Host }}",
162+
WorkspaceURLTemplate: "{{ .ID }}-{{ .Prefix }}-{{ .Host }}",
163+
PodRecreationMaxRetries: 3,
164+
PodRecreationBackoff: util.Duration(500 * time.Millisecond),
163165
}
164166
}
165167

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -181,24 +181,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
181181
// if there isn't a workspace pod and we're not currently deleting this workspace, create one.
182182
switch {
183183
case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1:
184-
if workspace.Status.PodRecreated > 0 {
185-
// This is a re-creation: Make sure to wait at least for
186-
c := wsk8s.GetCondition(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionPodRejected))
187-
if c == nil {
188-
err = fmt.Errorf("failed to retrieve PodRejected condition")
189-
log.Error(err, "failed to trigger pod recreation")
190-
return ctrl.Result{}, err
191-
}
192-
193-
recreationTimeout := r.podRecreationTimeout()
194-
waitTime := time.Until(c.LastTransitionTime.Add(recreationTimeout))
195-
if waitTime > 0 {
196-
log.WithValues("waitTime", waitTime).Info("waiting for pod recreation timeout")
197-
return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, nil
198-
}
199-
log.WithValues("waitedTime", waitTime.Abs().String()).Info("waited for pod recreation timeout")
200-
}
201-
202184
sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
203185
if err != nil {
204186
log.Error(err, "unable to create startWorkspace context")
@@ -244,6 +226,21 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
244226
}
245227
log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")
246228

229+
// Make sure to wait for "recreationTimeout" before creating the pod again
230+
if workspace.Status.PodDeletionTime == nil {
231+
log.Info("want to wait for pod recreation timeout, but podDeletionTime not set (yet)")
232+
return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil
233+
}
234+
235+
recreationTimeout := r.podRecreationTimeout()
236+
podDeletionTime := workspace.Status.PodDeletionTime.Time
237+
waitTime := time.Until(podDeletionTime.Add(recreationTimeout))
238+
if waitTime > 0 {
239+
log.WithValues("waitTime", waitTime).Info("waiting for pod recreation timeout")
240+
return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, nil
241+
}
242+
log.WithValues("waitedTime", waitTime.Abs().String()).Info("waited for pod recreation timeout")
243+
247244
// Must persist the modification pod starts, and ensure we retry on conflict.
248245
// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
249246
// when the workspace stops, due to PodStarts still being 0 when the original Pod
@@ -269,8 +266,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
269266
r.metrics.forgetWorkspace(workspace)
270267

271268
r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "")
272-
requeueAfter := r.podRecreationTimeout()
273-
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
269+
return ctrl.Result{Requeue: true}, nil
274270

275271
case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
276272
if err := r.deleteWorkspaceSecrets(ctx, workspace); err != nil {
@@ -377,7 +373,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
377373
}
378374

379375
func (r *WorkspaceReconciler) podRecreationTimeout() time.Duration {
380-
recreationTimeout := 5 * time.Second
376+
recreationTimeout := 15 * time.Second // waiting less time creates issues with ws-daemon's pod-centric control loop ("Dispatch") if the workspace ends up on the same node again
381377
if r.Config.PodRecreationBackoff != 0 {
382378
recreationTimeout = time.Duration(r.Config.PodRecreationBackoff)
383379
}

0 commit comments

Comments
 (0)