Skip to content

Commit 6fc55ba

Browse files
sipsma authored and austinvazquez committed
Use constant retry intervals+timeouts in vsock.
Signed-off-by: Erik Sipsma <[email protected]>
1 parent cc46ca2 commit 6fc55ba

File tree

1 file changed

+82
-58
lines changed

1 file changed

+82
-58
lines changed

internal/vm/vsock.go

Lines changed: 82 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -16,46 +16,52 @@ package vm
1616
import (
1717
"context"
1818
"net"
19+
"strings"
1920
"time"
2021

2122
"github.com/mdlayher/vsock"
2223
"github.com/sirupsen/logrus"
2324
)
2425

26+
// vsockConnectTimeout is the upper bound on how long the vsock helpers in this
// file keep retrying before giving up; it is applied on top of whatever
// deadline the caller's context already carries.
const vsockConnectTimeout = 20 * time.Second
29+
2530
// VSockDial attempts to connect to a vsock listener at the provided cid and port with a hardcoded number
2631
// of retries.
2732
func VSockDial(reqCtx context.Context, logger *logrus.Entry, contextID, port uint32) (net.Conn, error) {
28-
// VM should start within 200ms, vsock dial will make retries at 100ms, 200ms, 400ms, 800ms, 1.6s, 3.2s, 6.4s
29-
const (
30-
retryCount = 7
31-
initialDelay = 100 * time.Millisecond
32-
delayMultiplier = 2
33-
)
33+
// Retries occur every 100ms up to vsockConnectTimeout
34+
const retryInterval = 100 * time.Millisecond
35+
ctx, cancel := context.WithTimeout(reqCtx, vsockConnectTimeout)
36+
defer cancel()
3437

35-
var lastErr error
36-
var currentDelay = initialDelay
38+
var attemptCount int
39+
for range time.NewTicker(retryInterval).C {
40+
attemptCount++
41+
logger = logger.WithField("attempt", attemptCount)
3742

38-
for i := 1; i <= retryCount; i++ {
3943
select {
40-
case <-reqCtx.Done():
41-
return nil, reqCtx.Err()
44+
case <-ctx.Done():
45+
return nil, ctx.Err()
4246
default:
4347
conn, err := vsock.Dial(contextID, port)
4448
if err == nil {
45-
logger.WithField("connection", conn).Debug("Dial succeeded")
49+
logger.WithField("connection", conn).Debug("vsock dial succeeded")
4650
return conn, nil
4751
}
4852

49-
logger.WithError(err).Warnf("vsock dial failed (attempt %d of %d), will retry in %s", i, retryCount, currentDelay)
50-
time.Sleep(currentDelay)
53+
// ENXIO and ECONNRESET can be returned while the VM+agent are still in the midst of booting
54+
if isTemporaryNetErr(err) || isENXIO(err) || isECONNRESET(err) {
55+
logger.WithError(err).Debug("temporary vsock dial failure")
56+
continue
57+
}
5158

52-
lastErr = err
53-
currentDelay *= delayMultiplier
59+
logger.WithError(err).Error("non-temporary vsock dial failure")
60+
return nil, err
5461
}
5562
}
5663

57-
logger.WithError(lastErr).WithFields(logrus.Fields{"context_id": contextID, "port": port}).Error("vsock dial failed")
58-
return nil, lastErr
64+
panic("unreachable code") // appeases the compiler, which doesn't know the for loop is infinite
5965
}
6066

6167
// VSockDialConnector provides an IOConnector interface to the VSockDial function.
@@ -77,6 +83,47 @@ func VSockDialConnector(contextID, port uint32) IOConnector {
7783
}
7884
}
7985

86+
func vsockAccept(reqCtx context.Context, logger *logrus.Entry, port uint32) (net.Conn, error) {
87+
listener, err := vsock.Listen(port)
88+
if err != nil {
89+
return nil, err
90+
}
91+
92+
defer listener.Close()
93+
94+
// Retries occur every 10ms up to vsockConnectTimeout
95+
const retryInterval = 10 * time.Millisecond
96+
ctx, cancel := context.WithTimeout(reqCtx, vsockConnectTimeout)
97+
defer cancel()
98+
99+
var attemptCount int
100+
for range time.NewTicker(retryInterval).C {
101+
attemptCount++
102+
logger = logger.WithField("attempt", attemptCount)
103+
104+
select {
105+
case <-ctx.Done():
106+
return nil, ctx.Err()
107+
default:
108+
// accept is non-blocking so try to accept until we get a connection
109+
conn, err := listener.Accept()
110+
if err == nil {
111+
return conn, nil
112+
}
113+
114+
if isTemporaryNetErr(err) {
115+
logger.WithError(err).Debug("temporary stdio vsock accept failure")
116+
continue
117+
}
118+
119+
logger.WithError(err).Error("non-temporary stdio vsock accept failure")
120+
return nil, err
121+
}
122+
}
123+
124+
panic("unreachable code") // appeases the compiler, which doesn't know the for loop is infinite
125+
}
126+
80127
// VSockAcceptConnector provides an IOConnector that establishes the connection by listening on the provided
81128
// vsock port and accepting the first connection that comes in.
82129
func VSockAcceptConnector(port uint32) IOConnector {
@@ -86,47 +133,11 @@ func VSockAcceptConnector(port uint32) IOConnector {
86133
go func() {
87134
defer close(returnCh)
88135

89-
listener, err := vsock.Listen(port)
90-
if err != nil {
91-
returnCh <- IOConnectorResult{
92-
Err: err,
93-
}
94-
return
95-
}
96-
97-
defer listener.Close()
98-
99-
for range time.NewTicker(10 * time.Millisecond).C {
100-
select {
101-
case <-procCtx.Done():
102-
returnCh <- IOConnectorResult{
103-
Err: procCtx.Err(),
104-
}
105-
return
106-
default:
107-
// accept is non-blocking so try to accept until we get a connection
108-
conn, err := listener.Accept()
109-
if err == nil {
110-
returnCh <- IOConnectorResult{
111-
ReadWriteCloser: conn,
112-
}
113-
return
114-
}
115-
116-
if isTemporaryNetErr(err) {
117-
logger.WithError(err).Debug("temporary stdio vsock accept failure")
118-
continue
119-
}
120-
121-
logger.WithError(err).Error("non-temporary stdio vsock accept failure")
122-
returnCh <- IOConnectorResult{
123-
Err: err,
124-
}
125-
return
126-
}
136+
conn, err := vsockAccept(procCtx, logger, port)
137+
returnCh <- IOConnectorResult{
138+
ReadWriteCloser: conn,
139+
Err: err,
127140
}
128-
129-
panic("unreachable code") // appeases the compiler, which doesn't know the for loop is infinite
130141
}()
131142

132143
return returnCh
@@ -140,3 +151,16 @@ func isTemporaryNetErr(err error) bool {
140151

141152
return err != nil && ok && terr.Temporary()
142153
}
154+
155+
// Unfortunately, as "documented" on various online forums, there's no ideal way to
156+
// test for actual Linux error codes returned by the net library or wrappers
157+
// around that library. The common approach is to fall back on string matching,
158+
// which is done for the functions below
159+
160+
// isENXIO reports whether err's message ends with "no such device", the text
// this file uses to recognise an ENXIO failure by string matching. A nil err
// yields false, matching the nil handling of isTemporaryNetErr.
//
// NOTE(review): Go's syscall package renders ENXIO as "no such device or
// address" and ENODEV as "no such device"; confirm which errno the dial
// actually surfaces before tightening or renaming this check.
func isENXIO(err error) bool {
	if err == nil {
		return false
	}
	return strings.HasSuffix(err.Error(), "no such device")
}
163+
164+
// isECONNRESET reports whether err's message ends with "connection reset by
// peer", the text this file uses to recognise an ECONNRESET failure by string
// matching. A nil err yields false, matching the nil handling of
// isTemporaryNetErr.
func isECONNRESET(err error) bool {
	if err == nil {
		return false
	}
	return strings.HasSuffix(err.Error(), "connection reset by peer")
}

0 commit comments

Comments
 (0)