Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/workspacekit/cmd/rings.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ var ring0Cmd = &cobra.Command{

_ = cmd.Process.Signal(unix.SIGTERM)
time.Sleep(ring1ShutdownTimeout)
if cmd.Process == nil {
if cmd.Process == nil || cmd.ProcessState.Exited() {
return
}

Expand Down
5 changes: 5 additions & 0 deletions components/ws-daemon/pkg/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"golang.org/x/xerrors"

"github.com/containerd/containerd/api/types/task"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

Expand Down Expand Up @@ -54,6 +55,10 @@ type Runtime interface {

// DisposeContainer removes a stopped container, and everything we know about it
DisposeContainer(ctx context.Context, workspaceInstanceID string)

GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error)

ForceKillContainerTask(ctx context.Context, id ID) error
}

var (
Expand Down
23 changes: 23 additions & 0 deletions components/ws-daemon/pkg/container/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/containerd/containerd/api/events"
"github.com/containerd/containerd/api/services/tasks/v1"
"github.com/containerd/containerd/api/types"
"github.com/containerd/containerd/api/types/task"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/images"
Expand Down Expand Up @@ -576,6 +577,28 @@ func (s *Containerd) IsContainerdReady(ctx context.Context) (bool, error) {
return true, nil
}

func (s *Containerd) GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error) {
task, err := s.Client.TaskService().Get(ctx, &tasks.GetRequest{
ContainerID: string(id),
})
if err != nil {
return nil, err
}
if task.Process == nil {
return nil, fmt.Errorf("task has no process")
}
return task.Process, nil
}

func (s *Containerd) ForceKillContainerTask(ctx context.Context, id ID) error {
_, err := s.Client.TaskService().Kill(ctx, &tasks.KillRequest{
ContainerID: string(id),
Signal: 9,
All: true,
})
return err
}

var kubepodsQoSRegexp = regexp.MustCompile(`([^/]+)-([^/]+)-pod`)
var kubepodsRegexp = regexp.MustCompile(`([^/]+)-pod`)

Expand Down
68 changes: 65 additions & 3 deletions components/ws-daemon/pkg/controller/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
"github.com/opentracing/opentracing-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"

"google.golang.org/protobuf/proto"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -349,9 +350,43 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp

if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
// Container is still running, we need to wait for it to stop.
// We should get an event when the condition changes, but requeue
// anyways to make sure we act on it in time.
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
// We will wait for this situation for up to 5 minutes.
// If the container is still in a running state after that,
// there may be an issue with state synchronization.
// We should start backup anyway to avoid data loss.
if !(ws.Status.PodStoppingTime != nil && time.Since(ws.Status.PodStoppingTime.Time) > 5*time.Minute) {
// We should get an event when the condition changes, but requeue
// anyways to make sure we act on it in time.
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
}

if !ws.IsConditionTrue(workspacev1.WorkspaceConditionForceKilledTask) {
err = wsc.forceKillContainerTask(ctx, ws)
if err != nil {
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to force kill task: %v", err)
}
err = retry.RetryOnConflict(retryParams, func() error {
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
return err
}
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionForceKilledTask())
return wsc.Client.Status().Update(ctx, ws)
})
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set force killed task condition: %w", err)
}
return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
}

if time.Since(wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionForceKilledTask)).LastTransitionTime.Time) < 2*time.Second {
return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
}

glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Warn("workspace container is still running after 5 minutes of deletion, starting backup anyway")
err = wsc.dumpWorkspaceContainerInfo(ctx, ws)
if err != nil {
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to dump container info: %v", err)
}
}

if wsc.latestWorkspace(ctx, ws) != nil {
Expand Down Expand Up @@ -442,6 +477,33 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp
return ctrl.Result{}, nil
}

func (wsc *WorkspaceController) dumpWorkspaceContainerInfo(ctx context.Context, ws *workspacev1.Workspace) error {
id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
if err != nil {
return fmt.Errorf("failed to wait for container: %w", err)
}
task, err := wsc.runtime.GetContainerTaskInfo(ctx, id)
if err != nil {
return fmt.Errorf("failed to get container task info: %w", err)
}
glog.WithFields(ws.OWI()).WithFields(logrus.Fields{
"containerID": id,
"exitStatus": task.ExitStatus,
"pid": task.Pid,
"exitedAt": task.ExitedAt.AsTime(),
"status": task.Status.String(),
}).Info("container task info")
return nil
}

func (wsc *WorkspaceController) forceKillContainerTask(ctx context.Context, ws *workspacev1.Workspace) error {
id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
if err != nil {
return fmt.Errorf("failed to wait for container: %w", err)
}
return wsc.runtime.ForceKillContainerTask(ctx, id)
}

func (wsc *WorkspaceController) prepareInitializer(ctx context.Context, ws *workspacev1.Workspace) (*csapi.WorkspaceInitializer, error) {
var init csapi.WorkspaceInitializer
err := proto.Unmarshal(ws.Spec.Initializer, &init)
Expand Down
25 changes: 21 additions & 4 deletions components/ws-manager-api/go/crd/v1/workspace_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,17 @@ type WorkspaceImageInfo struct {

// WorkspaceStatus defines the observed state of Workspace
type WorkspaceStatus struct {
PodStarts int `json:"podStarts"`
PodRecreated int `json:"podRecreated"`
PodStarts int `json:"podStarts"`

// +kubebuilder:validation:Optional
PodRecreated int `json:"podRecreated"`
// +kubebuilder:validation:Optional
PodDeletionTime *metav1.Time `json:"podDeletionTime,omitempty"`
URL string `json:"url,omitempty" scrub:"redact"`
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
// +kubebuilder:validation:Optional
PodStoppingTime *metav1.Time `json:"podStoppingTime,omitempty"`

URL string `json:"url,omitempty" scrub:"redact"`
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`

// +kubebuilder:default=Unknown
Phase WorkspacePhase `json:"phase,omitempty"`
Expand Down Expand Up @@ -285,6 +291,9 @@ const (

// WorkspaceConditionStateWiped is true once all state has successfully been wiped by ws-daemon. This is only set if PodRejected=true, and the rejected workspace has been deleted.
WorkspaceConditionStateWiped WorkspaceCondition = "StateWiped"

// WorkspaceConditionForceKilledTask is true if we send a SIGKILL to the task
WorkspaceConditionForceKilledTask WorkspaceCondition = "ForceKilledTask"
)

func NewWorkspaceConditionDeployed() metav1.Condition {
Expand Down Expand Up @@ -439,6 +448,14 @@ func NewWorkspaceConditionContainerRunning(status metav1.ConditionStatus) metav1
}
}

func NewWorkspaceConditionForceKilledTask() metav1.Condition {
return metav1.Condition{
Type: string(WorkspaceConditionForceKilledTask),
LastTransitionTime: metav1.Now(),
Status: metav1.ConditionTrue,
}
}

// +kubebuilder:validation:Enum:=Unknown;Pending;Imagebuild;Creating;Initializing;Running;Stopping;Stopped
type WorkspacePhase string

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -548,11 +548,14 @@ spec:
- Stopping
- Stopped
type: string
podStarts:
type: integer
podDeletionTime:
format: date-time
type: string
podRecreated:
type: integer
podDeletionTime:
podStarts:
type: integer
podStoppingTime:
format: date-time
type: string
runtime:
Expand Down
4 changes: 4 additions & 0 deletions components/ws-manager-mk2/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
defer func() {
if oldPhase != workspace.Status.Phase {
log.Info("workspace phase updated", "oldPhase", oldPhase, "phase", workspace.Status.Phase)
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping {
t := metav1.Now()
workspace.Status.PodStoppingTime = &t
}
}
}()

Expand Down
Loading