Skip to content

Commit 5d2c988

Browse files
authored
Merge pull request containerd#10201 from abel-von/retry-remote-sandbox-wait
sandbox: retry Wait calls to the remote sandbox controller
2 parents 6383a1c + 58be881 commit 5d2c988

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

core/sandbox/proxy/controller.go

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package proxy
1818

1919
import (
2020
"context"
21+
"time"
2122

2223
api "github.com/containerd/containerd/api/services/sandbox/v1"
2324
"github.com/containerd/containerd/api/types"
@@ -119,9 +120,31 @@ func (s *remoteSandboxController) Shutdown(ctx context.Context, sandboxID string
119120
}
120121

121122
func (s *remoteSandboxController) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) {
122-
resp, err := s.client.Wait(ctx, &api.ControllerWaitRequest{SandboxID: sandboxID})
123-
if err != nil {
124-
return sandbox.ExitStatus{}, errdefs.FromGRPC(err)
123+
// For remote sandbox controllers, the controller process may restart, so
124+
// we have to retry when the error indicates a gRPC disconnection.
125+
var (
126+
resp *api.ControllerWaitResponse
127+
err error
128+
retryInterval time.Duration = 128
129+
)
130+
for {
131+
resp, err = s.client.Wait(ctx, &api.ControllerWaitRequest{SandboxID: sandboxID})
132+
if err != nil {
133+
grpcErr := errdefs.FromGRPC(err)
134+
if !errdefs.IsUnavailable(grpcErr) {
135+
return sandbox.ExitStatus{}, grpcErr
136+
}
137+
select {
138+
case <-time.After(retryInterval * time.Millisecond):
139+
if retryInterval < 4096 {
140+
retryInterval = retryInterval << 1
141+
}
142+
continue
143+
case <-ctx.Done():
144+
return sandbox.ExitStatus{}, grpcErr
145+
}
146+
}
147+
break
125148
}
126149

127150
return sandbox.ExitStatus{

internal/cri/server/events.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func (c *criService) startSandboxExitMonitor(ctx context.Context, id string, exi
5353
case exitRes := <-exitCh:
5454
exitStatus, exitedAt, err := exitRes.Result()
5555
if err != nil {
56-
log.L.WithError(err).Errorf("failed to get task exit status for %q", id)
56+
log.L.WithError(err).Errorf("failed to get sandbox status for %q", id)
5757
exitStatus = unknownExitCode
5858
exitedAt = time.Now()
5959
}

0 commit comments

Comments
 (0)