Commit 2939a05

sipsma authored and austinvazquez committed
Exit shim upon Task delete, not exit.
Issue #87 was previously addressed by having our shim process exit when all tasks it was managing exited. However, it turns out that is not the expected behavior for a containerd shim. The expected behavior is that a shim continue running until all tasks have been *deleted* by the client (via the containerd Delete API call). When any task has been deleted (or creation of a task fails), containerd sends the shim a Shutdown API call, at which time the shim is expected to exit if *all* tasks it was managing have been deleted. This commit brings our shim in line with that expected behavior.

It does, however, retain the existing FCControl parameter "ExitAfterAllTasksGone" (now renamed to "ExitAfterAllTasksDeleted") to give users the ability to override the default containerd shim behavior and instead let the Shim+VM keep running even after all tasks have been deleted.

The majority of the changes here are actually a refactor of the TaskManager interface to safely handle the Create/Delete task APIs alongside checking whether the shim has any tasks left under management (which it uses to decide whether it should shut down). The refactor also ensures that all I/O is flushed before a Delete call returns, which is a better solution to handling I/O races than the temporary fix applied in 1e36219.

Signed-off-by: Erik Sipsma <[email protected]>
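To make the new lifecycle concrete, here is a minimal sketch of the bookkeeping described above. Every name in it (taskManager, ShouldShutdown, exitAfterAllTasksDeleted) is hypothetical; the real TaskManager interface lives elsewhere in this repository and is not part of this diff.

package main

import "sync"

// taskManager is an illustrative stand-in for the refactored TaskManager:
// Create and Delete share one lock with the "any tasks left?" check, so the
// shutdown decision can never race against task creation or deletion.
type taskManager struct {
	mu    sync.Mutex
	tasks map[string]struct{}
}

func newTaskManager() *taskManager {
	return &taskManager{tasks: make(map[string]struct{})}
}

// CreateTask registers a task with the shim.
func (m *taskManager) CreateTask(id string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.tasks[id] = struct{}{}
}

// DeleteTask unregisters a task; per the commit message, the real
// implementation also flushes all task I/O before returning.
func (m *taskManager) DeleteTask(id string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	delete(m.tasks, id)
}

// ShouldShutdown is what a Shutdown handler would consult: exit only once
// every task has been deleted, unless the user opted out of the default
// containerd behavior via the renamed ExitAfterAllTasksDeleted parameter.
func (m *taskManager) ShouldShutdown(exitAfterAllTasksDeleted bool) bool {
	m.mu.Lock()
	defer m.mu.Unlock()
	return exitAfterAllTasksDeleted && len(m.tasks) == 0
}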
1 parent a6bf179 · commit 2939a05

File tree: 1 file changed · +142 −0 lines changed

internal/vm/vsock.go

@@ -0,0 +1,142 @@
// Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
//	http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package vm

import (
	"context"
	"net"
	"time"

	"github.com/mdlayher/vsock"
	"github.com/sirupsen/logrus"
)

// VSockDial attempts to connect to a vsock listener at the provided cid and port with a hardcoded number
// of retries.
func VSockDial(reqCtx context.Context, logger *logrus.Entry, contextID, port uint32) (net.Conn, error) {
	// The VM should start within 200ms; vsock dial will retry at 100ms, 200ms,
	// 400ms, 800ms, 1.6s, 3.2s, 6.4s (about 12.7s of waiting in total)
	const (
		retryCount      = 7
		initialDelay    = 100 * time.Millisecond
		delayMultiplier = 2
	)

	var lastErr error
	var currentDelay = initialDelay

	for i := 1; i <= retryCount; i++ {
		select {
		case <-reqCtx.Done():
			return nil, reqCtx.Err()
		default:
			conn, err := vsock.Dial(contextID, port)
			if err == nil {
				logger.WithField("connection", conn).Debug("Dial succeeded")
				return conn, nil
			}

			logger.WithError(err).Warnf("vsock dial failed (attempt %d of %d), will retry in %s", i, retryCount, currentDelay)
			time.Sleep(currentDelay)

			lastErr = err
			currentDelay *= delayMultiplier
		}
	}

	logger.WithError(lastErr).WithFields(logrus.Fields{"context_id": contextID, "port": port}).Error("vsock dial failed")
	return nil, lastErr
}

// VSockDialConnector provides an IOConnector interface to the VSockDial function.
func VSockDialConnector(contextID, port uint32) IOConnector {
	return func(taskCtx context.Context, logger *logrus.Entry) <-chan IOConnectorResult {
		returnCh := make(chan IOConnectorResult)

		go func() {
			defer close(returnCh)

			conn, err := VSockDial(taskCtx, logger, contextID, port)
			returnCh <- IOConnectorResult{
				ReadWriteCloser: conn,
				Err:             err,
			}
		}()

		return returnCh
	}
}

// VSockAcceptConnector provides an IOConnector that establishes the connection by listening on the provided
// vsock port and accepting the first connection that comes in.
func VSockAcceptConnector(port uint32) IOConnector {
	return func(reqCtx context.Context, logger *logrus.Entry) <-chan IOConnectorResult {
		returnCh := make(chan IOConnectorResult)

		go func() {
			defer close(returnCh)

			listener, err := vsock.Listen(port)
			if err != nil {
				returnCh <- IOConnectorResult{
					Err: err,
				}
				return
			}

			defer listener.Close()

			for range time.NewTicker(10 * time.Millisecond).C {
				select {
				case <-reqCtx.Done():
					returnCh <- IOConnectorResult{
						Err: reqCtx.Err(),
					}
					return
				default:
					// accept is non-blocking, so try to accept until we get a connection
					conn, err := listener.Accept()
					if err == nil {
						returnCh <- IOConnectorResult{
							ReadWriteCloser: conn,
						}
						return
					}

					if isTemporaryNetErr(err) {
						logger.WithError(err).Debug("temporary stdio vsock accept failure")
						continue
					}

					logger.WithError(err).Error("non-temporary stdio vsock accept failure")
					returnCh <- IOConnectorResult{
						Err: err,
					}
					return
				}
			}

			panic("unreachable code") // appeases the compiler, which doesn't know the for loop is infinite
		}()

		return returnCh
	}
}

// isTemporaryNetErr returns true if the error implements Temporary() and
// reports itself as temporary.
func isTemporaryNetErr(err error) bool {
	terr, ok := err.(interface {
		Temporary() bool
	})

	return err != nil && ok && terr.Temporary()
}
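For context, a caller might consume one of these connectors along these lines. IOConnector and IOConnectorResult are defined elsewhere in this package, and the cid/port values below are placeholders, so treat this as an illustrative sketch rather than the repository's actual wiring:

// connectStdio is a hypothetical helper living in the same package.
func connectStdio(ctx context.Context, logger *logrus.Entry) (io.ReadWriteCloser, error) {
	connector := VSockDialConnector(3, 10000) // example guest cid and port
	result := <-connector(ctx, logger)        // the channel yields exactly one result
	if result.Err != nil {
		return nil, result.Err // dial failed after all retries, or ctx was cancelled
	}
	return result.ReadWriteCloser, nil
}

Because each connector sends a single IOConnectorResult and then closes its channel, a one-shot receive like the above is all a caller needs.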
