Skip to content

Commit c895e84

Browse files
committed
[ws-manager] Re-create workspace pods (incl. test)
1 parent abb191f commit c895e84

File tree

8 files changed

+262
-7
lines changed

8 files changed

+262
-7
lines changed

components/ws-manager-api/go/config/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,11 @@ type Configuration struct {
142142

143143
// SSHGatewayCAPublicKey is a CA public key
144144
SSHGatewayCAPublicKey string
145+
146+
// PodRecreationMaxRetries
147+
PodRecreationMaxRetries int `json:"podRecreationMaxRetries,omitempty"`
148+
// PodRecreationBackoff
149+
PodRecreationBackoff util.Duration `json:"podRecreationBackoff,omitempty"`
145150
}
146151

147152
type WorkspaceClass struct {

components/ws-manager-api/go/crd/v1/workspace_types.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,10 @@ func (ps PortSpec) Equal(other PortSpec) bool {
170170

171171
// WorkspaceStatus defines the observed state of Workspace
172172
type WorkspaceStatus struct {
173-
PodStarts int `json:"podStarts"`
174-
URL string `json:"url,omitempty" scrub:"redact"`
175-
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
173+
PodStarts int `json:"podStarts"`
174+
PodRecreated int `json:"podRecreated"`
175+
URL string `json:"url,omitempty" scrub:"redact"`
176+
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
176177

177178
// +kubebuilder:default=Unknown
178179
Phase WorkspacePhase `json:"phase,omitempty"`
@@ -263,6 +264,9 @@ const (
263264
// WorkspaceContainerRunning is true if the workspace container is running.
264265
// Used to determine if a backup can be taken, only once the container is stopped.
265266
WorkspaceConditionContainerRunning WorkspaceCondition = "WorkspaceContainerRunning"
267+
268+
// WorkspaceConditionPodRejected is true if we detected that the pod was rejected by the node
269+
WorkspaceConditionPodRejected WorkspaceCondition = "PodRejected"
266270
)
267271

268272
func NewWorkspaceConditionDeployed() metav1.Condition {
@@ -291,6 +295,15 @@ func NewWorkspaceConditionFailed(message string) metav1.Condition {
291295
}
292296
}
293297

298+
func NewWorkspaceConditionPodRejected(message string, status metav1.ConditionStatus) metav1.Condition {
299+
return metav1.Condition{
300+
Type: string(WorkspaceConditionPodRejected),
301+
LastTransitionTime: metav1.Now(),
302+
Status: status,
303+
Message: message,
304+
}
305+
}
306+
294307
func NewWorkspaceConditionTimeout(message string) metav1.Condition {
295308
return metav1.Condition{
296309
Type: string(WorkspaceConditionTimeout),

components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,8 @@ spec:
543543
type: string
544544
podStarts:
545545
type: integer
546+
podRecreated:
547+
type: integer
546548
runtime:
547549
properties:
548550
hostIP:

components/ws-manager-mk2/controllers/metrics.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const (
3030
workspaceStartFailuresTotal string = "workspace_starts_failure_total"
3131
workspaceFailuresTotal string = "workspace_failure_total"
3232
workspaceStopsTotal string = "workspace_stops_total"
33+
workspaceRecreationsTotal string = "workspace_recreations_total"
3334
workspaceBackupsTotal string = "workspace_backups_total"
3435
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
3536
workspaceRestoresTotal string = "workspace_restores_total"
@@ -57,6 +58,7 @@ type controllerMetrics struct {
5758
totalStartsFailureCounterVec *prometheus.CounterVec
5859
totalFailuresCounterVec *prometheus.CounterVec
5960
totalStopsCounterVec *prometheus.CounterVec
61+
totalRecreationsCounterVec *prometheus.CounterVec
6062

6163
totalBackupCounterVec *prometheus.CounterVec
6264
totalBackupFailureCounterVec *prometheus.CounterVec
@@ -120,6 +122,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
120122
Name: workspaceStopsTotal,
121123
Help: "total number of workspaces stopped",
122124
}, []string{"reason", "type", "class"}),
125+
totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
126+
Namespace: metricsNamespace,
127+
Subsystem: metricsWorkspaceSubsystem,
128+
Name: workspaceRecreationsTotal,
129+
Help: "total number of workspace recreations",
130+
}, []string{"type", "class", "attempt"}),
123131

124132
totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
125133
Namespace: metricsNamespace,
@@ -233,6 +241,14 @@ func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1
233241
m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc()
234242
}
235243

244+
func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) {
245+
class := ws.Spec.Class
246+
tpe := string(ws.Spec.Type)
247+
attempt := fmt.Sprint(ws.Status.PodRecreated)
248+
249+
m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc()
250+
}
251+
236252
func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {
237253
class := ws.Spec.Class
238254
tpe := string(ws.Spec.Type)
@@ -291,6 +307,7 @@ type metricState struct {
291307
recordedContentReady bool
292308
recordedBackupFailed bool
293309
recordedBackupCompleted bool
310+
recordedRecreations int
294311
}
295312

296313
func newMetricState(ws *workspacev1.Workspace) metricState {
@@ -306,6 +323,7 @@ func newMetricState(ws *workspacev1.Workspace) metricState {
306323
recordedContentReady: ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady),
307324
recordedBackupFailed: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure),
308325
recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete),
326+
recordedRecreations: ws.Status.PodRecreated,
309327
}
310328
}
311329

components/ws-manager-mk2/controllers/status.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,16 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
123123
workspace.Status.Phase = *phase
124124
}
125125

126+
if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
127+
// Check: A situation where we want to retry?
128+
if pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == "NodeAffinity" || pod.Status.Reason == "OutOfCPU") && strings.HasPrefix(pod.Status.Message, "Pod was rejected") {
129+
// This is a situation where we want to re-create the pod!
130+
log.Info("workspace scheduling failed", "workspace", workspace.Name, "reason", failure)
131+
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue))
132+
r.Recorder.Event(workspace, corev1.EventTypeWarning, "PodRejected", failure)
133+
}
134+
}
135+
126136
if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
127137
// workspaces can fail only once - once there is a failed condition set, stick with it
128138
log.Info("workspace failed", "workspace", workspace.Name, "reason", failure)

components/ws-manager-mk2/controllers/subscriber_controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ func (r *SubscriberReconciler) Reconcile(ctx context.Context, req ctrl.Request)
6161
workspace.Status.Conditions = []metav1.Condition{}
6262
}
6363

64+
if workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
65+
// In this situation, we are about to re-create the pod. We don't want clients to see all the "stopping, stopped, starting" chatter, so we hide it here.
66+
// TODO(gpl) Is this a sane approach?
67+
return ctrl.Result{}, nil
68+
}
69+
6470
if r.OnReconcile != nil {
6571
r.OnReconcile(ctx, &workspace)
6672
}

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
180180
if len(workspacePods.Items) == 0 {
181181
// if there isn't a workspace pod and we're not currently deleting this workspace,// create one.
182182
switch {
183-
case workspace.Status.PodStarts == 0:
183+
case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1:
184184
sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
185185
if err != nil {
186186
log.Error(err, "unable to create startWorkspace context")
@@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
204204
log.Error(err, "unable to create Pod for Workspace", "pod", pod)
205205
return ctrl.Result{Requeue: true}, err
206206
} else {
207-
// TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
208-
// need to be deleted and re-created
209207
// Must increment and persist the pod starts, and ensure we retry on conflict.
210208
// If we fail to persist this value, it's possible that the Pod gets recreated
211209
// when the workspace stops, due to PodStarts still being 0 when the original Pod
@@ -221,6 +219,43 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
221219
r.Recorder.Event(workspace, corev1.EventTypeNormal, "Creating", "")
222220
}
223221

222+
case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped && workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected):
223+
if workspace.Status.PodRecreated > r.Config.PodRecreationMaxRetries {
224+
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
225+
return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
226+
}
227+
228+
// Must persist the modification pod starts, and ensure we retry on conflict.
229+
// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
230+
// when the workspace stops, due to PodStarts still being 0 when the original Pod
231+
// disappears.
232+
// Use a Patch instead of an Update, to prevent conflicts.
233+
patch := client.MergeFrom(workspace.DeepCopy())
234+
235+
// Reset status
236+
sc := workspace.Status.DeepCopy()
237+
workspace.Status = workspacev1.WorkspaceStatus{}
238+
workspace.Status.OwnerToken = sc.OwnerToken
239+
workspace.Status.PodStarts = sc.PodStarts
240+
workspace.Status.PodRecreated = sc.PodRecreated + 1
241+
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Recreating pod... (%d retry)", workspace.Status.PodRecreated), metav1.ConditionFalse))
242+
243+
if err := r.Status().Patch(ctx, workspace, patch); err != nil {
244+
log.Error(err, "Failed to patch workspace status-reset")
245+
return ctrl.Result{}, err
246+
}
247+
248+
// Reset metrics cache
249+
r.metrics.forgetWorkspace(workspace)
250+
251+
requeueAfter := 5 * time.Second
252+
if r.Config.PodRecreationBackoff != 0 {
253+
requeueAfter = time.Duration(r.Config.PodRecreationBackoff)
254+
}
255+
256+
r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "")
257+
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
258+
224259
case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
225260
if err := r.deleteWorkspaceSecrets(ctx, workspace); err != nil {
226261
return ctrl.Result{}, err
@@ -378,6 +413,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
378413
lastState.recordedStartTime = true
379414
}
380415

416+
if lastState.recordedRecreations < workspace.Status.PodRecreated {
417+
r.metrics.countWorkspaceRecreations(&log, workspace)
418+
lastState.recordedRecreations = workspace.Status.PodRecreated
419+
}
420+
381421
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
382422
r.metrics.countWorkspaceStop(&log, workspace)
383423

@@ -403,7 +443,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool {
403443
isAborted := ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted)
404444
// Also ignore workspaces that are requested to be stopped before they became ready.
405445
isStoppedByRequest := ws.IsConditionTrue(workspacev1.WorkspaceConditionStoppedByRequest)
406-
return !everReady && !isAborted && !isStoppedByRequest
446+
// Also ignore pods that got rejected by the node
447+
isPodRejected := ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected)
448+
return !everReady && !isAborted && !isStoppedByRequest && !isPodRejected
407449
}
408450

409451
func (r *WorkspaceReconciler) emitPhaseEvents(ctx context.Context, ws *workspacev1.Workspace, old *workspacev1.WorkspaceStatus) {

0 commit comments

Comments
 (0)