diff --git a/components/workspacekit/cmd/rings.go b/components/workspacekit/cmd/rings.go
index 14c2c831523df2..148d0eff44e001 100644
--- a/components/workspacekit/cmd/rings.go
+++ b/components/workspacekit/cmd/rings.go
@@ -151,7 +151,9 @@ var ring0Cmd = &cobra.Command{
 
 			_ = cmd.Process.Signal(unix.SIGTERM)
 			time.Sleep(ring1ShutdownTimeout)
-			if cmd.Process == nil {
+			// ProcessState is only populated once Wait/Run has returned; until then it is
+			// nil, so test for nil instead of calling a method on it (which would panic).
+			if cmd.Process == nil || cmd.ProcessState != nil {
 				return
 			}
 
diff --git a/components/ws-daemon/pkg/container/container.go b/components/ws-daemon/pkg/container/container.go
index 888a6e08c37271..233bcc65106b34 100644
--- a/components/ws-daemon/pkg/container/container.go
+++ b/components/ws-daemon/pkg/container/container.go
@@ -9,6 +9,7 @@ import (
 
 	"golang.org/x/xerrors"
 
+	"github.com/containerd/containerd/api/types/task"
 	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
 )
 
@@ -54,6 +55,12 @@ type Runtime interface {
 
 	// DisposeContainer removes a stopped container, and everything we know about it
 	DisposeContainer(ctx context.Context, workspaceInstanceID string)
+
+	// GetContainerTaskInfo returns the process information of the task that belongs to the container
+	GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error)
+
+	// ForceKillContainerTask sends SIGKILL to all processes of the container's task
+	ForceKillContainerTask(ctx context.Context, id ID) error
 }
 
 var (
diff --git a/components/ws-daemon/pkg/container/containerd.go b/components/ws-daemon/pkg/container/containerd.go
index 4af0dab74043b5..b40c4fc03dd28f 100644
--- a/components/ws-daemon/pkg/container/containerd.go
+++ b/components/ws-daemon/pkg/container/containerd.go
@@ -20,6 +20,7 @@ import (
 	"github.com/containerd/containerd/api/events"
 	"github.com/containerd/containerd/api/services/tasks/v1"
 	"github.com/containerd/containerd/api/types"
+	"github.com/containerd/containerd/api/types/task"
 	"github.com/containerd/containerd/containers"
 	"github.com/containerd/containerd/errdefs"
 	"github.com/containerd/containerd/images"
@@ -576,6 +577,31 @@ func (s *Containerd) IsContainerdReady(ctx context.Context) (bool, error) {
 	return true, nil
 }
 
+// GetContainerTaskInfo returns the process information the task service reports for the given container.
+// It returns an error if the lookup fails or if the response carries no process.
+func (s *Containerd) GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error) {
+	resp, err := s.Client.TaskService().Get(ctx, &tasks.GetRequest{
+		ContainerID: string(id),
+	})
+	if err != nil {
+		return nil, err
+	}
+	if resp.Process == nil {
+		return nil, fmt.Errorf("task has no process")
+	}
+	return resp.Process, nil
+}
+
+// ForceKillContainerTask asks the task service to deliver signal 9 to all processes of the container's task.
+func (s *Containerd) ForceKillContainerTask(ctx context.Context, id ID) error {
+	_, err := s.Client.TaskService().Kill(ctx, &tasks.KillRequest{
+		ContainerID: string(id),
+		Signal:      9, // SIGKILL
+		All:         true,
+	})
+	return err
+}
+
 var kubepodsQoSRegexp = regexp.MustCompile(`([^/]+)-([^/]+)-pod`)
 var kubepodsRegexp = regexp.MustCompile(`([^/]+)-pod`)
 
diff --git a/components/ws-daemon/pkg/controller/workspace_controller.go b/components/ws-daemon/pkg/controller/workspace_controller.go
index 5f199ca6b3487d..e31d8fda06cc8d 100644
--- a/components/ws-daemon/pkg/controller/workspace_controller.go
+++ b/components/ws-daemon/pkg/controller/workspace_controller.go
@@ -20,6 +20,7 @@ import (
 	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
 	"github.com/opentracing/opentracing-go"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/sirupsen/logrus"
 	"google.golang.org/protobuf/proto"
 	corev1 "k8s.io/api/core/v1"
 
@@ -349,9 +350,43 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp
 
 	if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
 		// Container is still running, we need to wait for it to stop.
-		// We should get an event when the condition changes, but requeue
-		// anyways to make sure we act on it in time.
-		return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
+		// We will wait for this situation for up to 5 minutes.
+		// If the container is still in a running state after that,
+		// there may be an issue with state synchronization.
+		// We should start backup anyway to avoid data loss.
+		if !(ws.Status.PodStoppingTime != nil && time.Since(ws.Status.PodStoppingTime.Time) > 5*time.Minute) {
+			// We should get an event when the condition changes, but requeue
+			// anyways to make sure we act on it in time.
+			return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
+		}
+
+		if !ws.IsConditionTrue(workspacev1.WorkspaceConditionForceKilledTask) {
+			err = wsc.forceKillContainerTask(ctx, ws)
+			if err != nil {
+				glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to force kill task: %v", err)
+			}
+			err = retry.RetryOnConflict(retryParams, func() error {
+				if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
+					return err
+				}
+				ws.Status.SetCondition(workspacev1.NewWorkspaceConditionForceKilledTask())
+				return wsc.Client.Status().Update(ctx, ws)
+			})
+			if err != nil {
+				return ctrl.Result{}, fmt.Errorf("failed to set force killed task condition: %w", err)
+			}
+			return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
+		}
+
+		if time.Since(wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionForceKilledTask)).LastTransitionTime.Time) < 2*time.Second {
+			return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
+		}
+
+		glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Warn("workspace container is still running after 5 minutes of deletion, starting backup anyway")
+		err = wsc.dumpWorkspaceContainerInfo(ctx, ws)
+		if err != nil {
+			glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to dump container info: %v", err)
+		}
 	}
 
 	if wsc.latestWorkspace(ctx, ws) != nil {
@@ -442,6 +477,36 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp
 	return ctrl.Result{}, nil
 }
 
+// dumpWorkspaceContainerInfo resolves the workspace's container and logs its task state
+// (container ID, pid, status, exit status and exit time) for debugging.
+func (wsc *WorkspaceController) dumpWorkspaceContainerInfo(ctx context.Context, ws *workspacev1.Workspace) error {
+	id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
+	if err != nil {
+		return fmt.Errorf("failed to wait for container: %w", err)
+	}
+	task, err := wsc.runtime.GetContainerTaskInfo(ctx, id)
+	if err != nil {
+		return fmt.Errorf("failed to get container task info: %w", err)
+	}
+	glog.WithFields(ws.OWI()).WithFields(logrus.Fields{
+		"containerID": id,
+		"exitStatus":  task.ExitStatus,
+		"pid":         task.Pid,
+		"exitedAt":    task.ExitedAt.AsTime(),
+		"status":      task.Status.String(),
+	}).Info("container task info")
+	return nil
+}
+
+// forceKillContainerTask resolves the workspace's container and force-kills its task through the runtime.
+func (wsc *WorkspaceController) forceKillContainerTask(ctx context.Context, ws *workspacev1.Workspace) error {
+	id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
+	if err != nil {
+		return fmt.Errorf("failed to wait for container: %w", err)
+	}
+	return wsc.runtime.ForceKillContainerTask(ctx, id)
+}
+
 func (wsc *WorkspaceController) prepareInitializer(ctx context.Context, ws *workspacev1.Workspace) (*csapi.WorkspaceInitializer, error) {
 	var init csapi.WorkspaceInitializer
 	err := proto.Unmarshal(ws.Spec.Initializer, &init)
diff --git a/components/ws-manager-api/go/crd/v1/workspace_types.go b/components/ws-manager-api/go/crd/v1/workspace_types.go
index f16e75810139c1..e95a52b528b118 100644
--- a/components/ws-manager-api/go/crd/v1/workspace_types.go
+++ b/components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -181,11 +181,17 @@ type WorkspaceImageInfo struct {
 
 // WorkspaceStatus defines the observed state of Workspace
 type WorkspaceStatus struct {
-	PodStarts       int          `json:"podStarts"`
-	PodRecreated    int          `json:"podRecreated"`
+	PodStarts int `json:"podStarts"`
+
+	// +kubebuilder:validation:Optional
+	PodRecreated int `json:"podRecreated"`
+	// +kubebuilder:validation:Optional
 	PodDeletionTime *metav1.Time `json:"podDeletionTime,omitempty"`
-	URL             string       `json:"url,omitempty" scrub:"redact"`
-	OwnerToken      string       `json:"ownerToken,omitempty" scrub:"redact"`
+	// +kubebuilder:validation:Optional
+	PodStoppingTime *metav1.Time `json:"podStoppingTime,omitempty"`
+
+	URL        string `json:"url,omitempty" scrub:"redact"`
+	OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
 
 	// +kubebuilder:default=Unknown
 	Phase WorkspacePhase `json:"phase,omitempty"`
@@ -285,6 +291,9 @@ const (
 
 	// WorkspaceConditionStateWiped is true once all state has successfully been wiped by ws-daemon. This is only set if PodRejected=true, and the rejected workspace has been deleted.
 	WorkspaceConditionStateWiped WorkspaceCondition = "StateWiped"
+
+	// WorkspaceConditionForceKilledTask is true if we send a SIGKILL to the task
+	WorkspaceConditionForceKilledTask WorkspaceCondition = "ForceKilledTask"
 )
 
 func NewWorkspaceConditionDeployed() metav1.Condition {
@@ -439,6 +448,15 @@ func NewWorkspaceConditionContainerRunning(status metav1.ConditionStatus) metav1
 	}
 }
 
+// NewWorkspaceConditionForceKilledTask returns the ForceKilledTask condition with status true, transitioned now.
+func NewWorkspaceConditionForceKilledTask() metav1.Condition {
+	return metav1.Condition{
+		Type:               string(WorkspaceConditionForceKilledTask),
+		LastTransitionTime: metav1.Now(),
+		Status:             metav1.ConditionTrue,
+	}
+}
+
 // +kubebuilder:validation:Enum:=Unknown;Pending;Imagebuild;Creating;Initializing;Running;Stopping;Stopped
 type WorkspacePhase string
 
diff --git a/components/ws-manager-api/go/crd/v1/zz_generated.deepcopy.go b/components/ws-manager-api/go/crd/v1/zz_generated.deepcopy.go
index 0a92d5461c5651..600143ec67d9d6 100644
--- a/components/ws-manager-api/go/crd/v1/zz_generated.deepcopy.go
+++ b/components/ws-manager-api/go/crd/v1/zz_generated.deepcopy.go
@@ -440,6 +440,14 @@ func (in *WorkspaceSpec) DeepCopy() *WorkspaceSpec {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) {
 	*out = *in
+	if in.PodDeletionTime != nil {
+		in, out := &in.PodDeletionTime, &out.PodDeletionTime
+		*out = (*in).DeepCopy()
+	}
+	if in.PodStoppingTime != nil {
+		in, out := &in.PodStoppingTime, &out.PodStoppingTime
+		*out = (*in).DeepCopy()
+	}
 	if in.Conditions != nil {
 		in, out := &in.Conditions, &out.Conditions
 		*out = make([]metav1.Condition, len(*in))
diff --git a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml
index 738a6d05eb4d10..765c0b1700dd93 100644
--- a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml
+++ b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml
@@ -548,11 +548,14 @@ spec:
                 - Stopping
                 - Stopped
                 type: string
-              podStarts:
-                type: integer
+              podDeletionTime:
+                format: date-time
+                type: string
               podRecreated:
                 type: integer
-              podDeletionTime:
+              podStarts:
+                type: integer
+              podStoppingTime:
                 format: date-time
                 type: string
               runtime:
diff --git a/components/ws-manager-mk2/controllers/status.go b/components/ws-manager-mk2/controllers/status.go
index 115854b8753211..5e5f33f1a42635 100644
--- a/components/ws-manager-mk2/controllers/status.go
+++ b/components/ws-manager-mk2/controllers/status.go
@@ -59,6 +59,11 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
 	defer func() {
 		if oldPhase != workspace.Status.Phase {
 			log.Info("workspace phase updated", "oldPhase", oldPhase, "phase", workspace.Status.Phase)
+			// Record the moment the workspace transitions into the Stopping phase.
+			if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping {
+				t := metav1.Now()
+				workspace.Status.PodStoppingTime = &t
+			}
 		}
 	}()
 