diff --git a/components/ws-manager-mk2/controllers/status.go b/components/ws-manager-mk2/controllers/status.go index 5e5f33f1a42635..f2fecbb4953ce3 100644 --- a/components/ws-manager-mk2/controllers/status.go +++ b/components/ws-manager-mk2/controllers/status.go @@ -440,10 +440,12 @@ func (r *WorkspaceReconciler) extractFailure(ctx context.Context, ws *workspacev if !ws.IsHeadless() { return fmt.Sprintf("container %s completed; containers of a workspace pod are not supposed to do that", cs.Name), nil } - } else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode { + } else if !isPodBeingDeleted(pod) && terminationState.ExitCode == containerUnknownExitCode { + return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), nil + } else if !isPodBeingDeleted(pod) { // if a container is terminated and it wasn't because of either: // - regular shutdown - // - the exit code "UNKNOWN" (which might be caused by an intermittent issue and is handled in extractStatusFromPod) + // - the exit code "UNKNOWN" (which might be caused by an intermittent issue) // - another known error // then we report it as UNKNOWN phase := workspacev1.WorkspacePhaseUnknown diff --git a/components/ws-manager-mk2/controllers/workspace_controller_test.go b/components/ws-manager-mk2/controllers/workspace_controller_test.go index a284c2224fa28c..665752a23fae8f 100644 --- a/components/ws-manager-mk2/controllers/workspace_controller_test.go +++ b/components/ws-manager-mk2/controllers/workspace_controller_test.go @@ -218,6 +218,40 @@ var _ = Describe("WorkspaceController", func() { }) }) + It("should handle workspace failure with unknown exit code", func() { + ws := newWorkspace(uuid.NewString(), "default") + m := collectMetricCounts(wsMetrics, ws) + pod := createWorkspaceExpectPod(ws) + + markReady(ws) + + // Update Pod with failed exit status. 
+ updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) { + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: containerUnknownExitCode, + }, + }, + }) + }) + + // Controller should detect container exit and add Failed condition. + expectConditionEventually(ws, string(workspacev1.WorkspaceConditionFailed), metav1.ConditionTrue, "") + + expectFinalizerAndMarkBackupCompleted(ws, pod) + + expectWorkspaceCleanup(ws, pod) + + expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{ + restores: 1, + startFailures: 0, + failures: 1, + stops: map[StopReason]int{StopReasonFailed: 1}, + backups: 1, + }) + }) + It("should clean up timed out workspaces", func() { ws := newWorkspace(uuid.NewString(), "default") m := collectMetricCounts(wsMetrics, ws)