@@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
204204				log .Error (err , "unable to create Pod for Workspace" , "pod" , pod )
205205				return  ctrl.Result {Requeue : true }, err 
206206			} else  {
207- 				// TODO(cw): replicate the startup mechanism where pods can fail to be scheduled, 
208- 				//			 need to be deleted and re-created 
209207				// Must increment and persist the pod starts, and ensure we retry on conflict. 
210208				// If we fail to persist this value, it's possible that the Pod gets recreated 
211209				// when the workspace stops, due to PodStarts still being 0 when the original Pod 
@@ -221,6 +219,34 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
221219				r .Recorder .Event (workspace , corev1 .EventTypeNormal , "Creating" , "" )
222220			}
223221
222+ 		case  workspace .Status .Phase  ==  workspacev1 .WorkspacePhaseStopped  &&  workspace .IsConditionTrue (workspacev1 .WorkspaceConditionPodRejected ):
223+ 			if  workspace .Status .PodRecreated  >  r .Config .PodRecreationMaxRetries  {
224+ 				workspace .Status .SetCondition (workspacev1 .NewWorkspaceConditionPodRejected (fmt .Sprintf ("Pod reached maximum recreations %d, failing" , workspace .Status .PodRecreated ), metav1 .ConditionFalse ))
225+ 				return  ctrl.Result {Requeue : true }, nil  // requeue so we end up in the "Stopped" case below 
226+ 			}
227+ 
228+ 			// Must persist the modified PodStarts and PodRecreated values, and ensure we retry on conflict. 
229+ 			// If we fail to persist this value, it's possible that the Pod gets recreated endlessly 
230+ 			// when the workspace stops, due to PodStarts still being 0 when the original Pod 
231+ 			// disappears. 
232+ 			// Use a Patch instead of an Update, to prevent conflicts. 
233+ 			patch  :=  client .MergeFrom (workspace .DeepCopy ())
234+ 			workspace .Status .PodStarts  =  0 
235+ 			workspace .Status .PodRecreated ++ 
236+ 			workspace .Status .SetCondition (workspacev1 .NewWorkspaceConditionPodRejected (fmt .Sprintf ("Recreating pod... (%d retry)" , workspace .Status .PodRecreated ), metav1 .ConditionFalse ))
237+ 			if  err  :=  r .Status ().Patch (ctx , workspace , patch ); err  !=  nil  {
238+ 				log .Error (err , "Failed to patch PodStarts=0,PodRecreated++ in workspace status" )
239+ 				return  ctrl.Result {}, err 
240+ 			}
241+ 
242+ 			requeueAfter  :=  5  *  time .Second 
243+ 			if  r .Config .PodRecreationBackoff  !=  0  {
244+ 				requeueAfter  =  time .Duration (r .Config .PodRecreationBackoff )
245+ 			}
246+ 
247+ 			r .Recorder .Event (workspace , corev1 .EventTypeNormal , "Recreating" , "" )
248+ 			return  ctrl.Result {Requeue : true , RequeueAfter : requeueAfter }, nil 
249+ 
224250		case  workspace .Status .Phase  ==  workspacev1 .WorkspacePhaseStopped :
225251			if  err  :=  r .deleteWorkspaceSecrets (ctx , workspace ); err  !=  nil  {
226252				return  ctrl.Result {}, err 
@@ -403,7 +429,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool {
403429	isAborted  :=  ws .IsConditionTrue (workspacev1 .WorkspaceConditionAborted )
404430	// Also ignore workspaces that are requested to be stopped before they became ready. 
405431	isStoppedByRequest  :=  ws .IsConditionTrue (workspacev1 .WorkspaceConditionStoppedByRequest )
406- 	return  ! everReady  &&  ! isAborted  &&  ! isStoppedByRequest 
432+ 	// Also ignore pods that got rejected by the node 
433+ 	isPodRejected  :=  ws .IsConditionTrue (workspacev1 .WorkspaceConditionPodRejected )
434+ 	return  ! everReady  &&  ! isAborted  &&  ! isStoppedByRequest  &&  ! isPodRejected 
407435}
408436
409437func  (r  * WorkspaceReconciler ) emitPhaseEvents (ctx  context.Context , ws  * workspacev1.Workspace , old  * workspacev1.WorkspaceStatus ) {
0 commit comments