@@ -180,7 +180,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
180180 if len (workspacePods .Items ) == 0 {
181181 // if there isn't a workspace pod and we're not currently deleting this workspace, create one.
182182 switch {
183- case workspace .Status .PodStarts == 0 :
183+ case workspace .Status .PodStarts == 0 || workspace . Status . PodStarts - workspace . Status . PodRecreated < 1 :
184184 sctx , err := newStartWorkspaceContext (ctx , r .Config , workspace )
185185 if err != nil {
186186 log .Error (err , "unable to create startWorkspace context" )
@@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
204204 log .Error (err , "unable to create Pod for Workspace" , "pod" , pod )
205205 return ctrl.Result {Requeue : true }, err
206206 } else {
207- // TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
208- // need to be deleted and re-created
209207 // Must increment and persist the pod starts, and ensure we retry on conflict.
210208 // If we fail to persist this value, it's possible that the Pod gets recreated
211209 // when the workspace stops, due to PodStarts still being 0 when the original Pod
@@ -221,6 +219,43 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
221219 r .Recorder .Event (workspace , corev1 .EventTypeNormal , "Creating" , "" )
222220 }
223221
222+ case workspace .Status .Phase == workspacev1 .WorkspacePhaseStopped && workspace .IsConditionTrue (workspacev1 .WorkspaceConditionPodRejected ):
223+ if workspace .Status .PodRecreated > r .Config .PodRecreationMaxRetries {
224+ workspace .Status .SetCondition (workspacev1 .NewWorkspaceConditionPodRejected (fmt .Sprintf ("Pod reached maximum recreations %d, failing" , workspace .Status .PodRecreated ), metav1 .ConditionFalse ))
225+ return ctrl.Result {Requeue : true }, nil // requeue so we end up in the "Stopped" case below
226+ }
227+
228+ // Must persist the incremented pod recreation count, and ensure we retry on conflict.
229+ // If we fail to persist this value, it's possible that the Pod gets recreated endlessly
230+ // when the workspace stops, due to PodStarts still being 0 when the original Pod
231+ // disappears.
232+ // Use a Patch instead of an Update, to prevent conflicts.
233+ patch := client .MergeFrom (workspace .DeepCopy ())
234+
235+ // Reset status
236+ sc := workspace .Status .DeepCopy ()
237+ workspace .Status = workspacev1.WorkspaceStatus {}
238+ workspace .Status .OwnerToken = sc .OwnerToken
239+ workspace .Status .PodStarts = sc .PodStarts
240+ workspace .Status .PodRecreated = sc .PodRecreated + 1
241+ workspace .Status .SetCondition (workspacev1 .NewWorkspaceConditionPodRejected (fmt .Sprintf ("Recreating pod... (%d retry)" , workspace .Status .PodRecreated ), metav1 .ConditionFalse ))
242+
243+ if err := r .Status ().Patch (ctx , workspace , patch ); err != nil {
244+ log .Error (err , "Failed to patch workspace status-reset" )
245+ return ctrl.Result {}, err
246+ }
247+
248+ // Reset metrics cache
249+ r .metrics .forgetWorkspace (workspace )
250+
251+ requeueAfter := 5 * time .Second
252+ if r .Config .PodRecreationBackoff != 0 {
253+ requeueAfter = time .Duration (r .Config .PodRecreationBackoff )
254+ }
255+
256+ r .Recorder .Event (workspace , corev1 .EventTypeNormal , "Recreating" , "" )
257+ return ctrl.Result {Requeue : true , RequeueAfter : requeueAfter }, nil
258+
224259 case workspace .Status .Phase == workspacev1 .WorkspacePhaseStopped :
225260 if err := r .deleteWorkspaceSecrets (ctx , workspace ); err != nil {
226261 return ctrl.Result {}, err
@@ -378,6 +413,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
378413 lastState .recordedStartTime = true
379414 }
380415
416+ if lastState .recordedRecreations < workspace .Status .PodRecreated {
417+ r .metrics .countWorkspaceRecreations (& log , workspace )
418+ lastState .recordedRecreations = workspace .Status .PodRecreated
419+ }
420+
381421 if workspace .Status .Phase == workspacev1 .WorkspacePhaseStopped {
382422 r .metrics .countWorkspaceStop (& log , workspace )
383423
@@ -403,7 +443,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool {
403443 isAborted := ws .IsConditionTrue (workspacev1 .WorkspaceConditionAborted )
404444 // Also ignore workspaces that are requested to be stopped before they became ready.
405445 isStoppedByRequest := ws .IsConditionTrue (workspacev1 .WorkspaceConditionStoppedByRequest )
406- return ! everReady && ! isAborted && ! isStoppedByRequest
446+ // Also ignore pods that got rejected by the node
447+ isPodRejected := ws .IsConditionTrue (workspacev1 .WorkspaceConditionPodRejected )
448+ return ! everReady && ! isAborted && ! isStoppedByRequest && ! isPodRejected
407449}
408450
409451func (r * WorkspaceReconciler ) emitPhaseEvents (ctx context.Context , ws * workspacev1.Workspace , old * workspacev1.WorkspaceStatus ) {
0 commit comments