@@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
204204 log .Error (err , "unable to create Pod for Workspace" , "pod" , pod )
205205 return ctrl.Result {Requeue : true }, err
206206 } else {
207- // TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
208- // need to be deleted and re-created
209207 // Must increment and persist the pod starts, and ensure we retry on conflict.
210208 // If we fail to persist this value, it's possible that the Pod gets recreated
211209 // when the workspace stops, due to PodStarts still being 0 when the original Pod
@@ -221,6 +219,34 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
221219 r .Recorder .Event (workspace , corev1 .EventTypeNormal , "Creating" , "" )
222220 }
223221
222+ case workspace .Status .Phase == workspacev1 .WorkspacePhaseStopped && workspace .IsConditionTrue (workspacev1 .WorkspaceConditionPodRejected ):
223+ if workspace .Status .PodRecreated > r .Config .PodRecreationMaxRetries {
224+ workspace .Status .SetCondition (workspacev1 .NewWorkspaceConditionPodRejected (fmt .Sprintf ("Pod reached maximum recreations %d, failing" , workspace .Status .PodRecreated ), metav1 .ConditionFalse ))
225+ return ctrl.Result {Requeue : true }, nil // requeue so we end up in the "Stopped" case below
226+ }
227+
228+ // Must persist the reset of PodStarts and the increment of PodRecreated, and ensure we retry on failure.
229+ // If we fail to persist these values, it's possible that the Pod gets recreated endlessly,
230+ // because PodRecreated would never grow past the PodRecreationMaxRetries limit
231+ // checked above.
232+ // Use a Patch instead of an Update, to prevent conflicts.
233+ patch := client .MergeFrom (workspace .DeepCopy ())
234+ workspace .Status .PodStarts = 0
235+ workspace .Status .PodRecreated ++
236+ workspace .Status .SetCondition (workspacev1 .NewWorkspaceConditionPodRejected (fmt .Sprintf ("Recreating pod... (%d retry)" , workspace .Status .PodRecreated ), metav1 .ConditionFalse ))
237+ if err := r .Status ().Patch (ctx , workspace , patch ); err != nil {
238+ log .Error (err , "Failed to patch PodStarts=0,PodRecreated++ in workspace status" )
239+ return ctrl.Result {}, err
240+ }
241+
242+ requeueAfter := 5 * time .Second
243+ if r .Config .PodRecreationBackoff != 0 {
244+ requeueAfter = time .Duration (r .Config .PodRecreationBackoff )
245+ }
246+
247+ r .Recorder .Event (workspace , corev1 .EventTypeNormal , "Recreating" , "" )
248+ return ctrl.Result {Requeue : true , RequeueAfter : requeueAfter }, nil
249+
224250 case workspace .Status .Phase == workspacev1 .WorkspacePhaseStopped :
225251 if err := r .deleteWorkspaceSecrets (ctx , workspace ); err != nil {
226252 return ctrl.Result {}, err
@@ -403,7 +429,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool {
403429 isAborted := ws .IsConditionTrue (workspacev1 .WorkspaceConditionAborted )
404430 // Also ignore workspaces that are requested to be stopped before they became ready.
405431 isStoppedByRequest := ws .IsConditionTrue (workspacev1 .WorkspaceConditionStoppedByRequest )
406- return ! everReady && ! isAborted && ! isStoppedByRequest
432+ // Also ignore pods that got rejected by the node
433+ isPodRejected := ws .IsConditionTrue (workspacev1 .WorkspaceConditionPodRejected )
434+ return ! everReady && ! isAborted && ! isStoppedByRequest && ! isPodRejected
407435}
408436
409437func (r * WorkspaceReconciler ) emitPhaseEvents (ctx context.Context , ws * workspacev1.Workspace , old * workspacev1.WorkspaceStatus ) {
0 commit comments