@@ -16,46 +16,52 @@ package vm
16
16
import (
17
17
"context"
18
18
"net"
19
+ "strings"
19
20
"time"
20
21
21
22
"github.com/mdlayher/vsock"
22
23
"github.com/sirupsen/logrus"
23
24
)
24
25
26
// vsockConnectTimeout bounds the total time spent retrying a vsock dial or
// accept before the attempt is abandoned.
const vsockConnectTimeout = 20 * time.Second
29
+
25
30
// VSockDial attempts to connect to a vsock listener at the provided cid and port with a hardcoded number
26
31
// of retries.
27
32
func VSockDial (reqCtx context.Context , logger * logrus.Entry , contextID , port uint32 ) (net.Conn , error ) {
28
- // VM should start within 200ms, vsock dial will make retries at 100ms, 200ms, 400ms, 800ms, 1.6s, 3.2s, 6.4s
29
- const (
30
- retryCount = 7
31
- initialDelay = 100 * time .Millisecond
32
- delayMultiplier = 2
33
- )
33
+ // Retries occur every 100ms up to vsockConnectTimeout
34
+ const retryInterval = 100 * time .Millisecond
35
+ ctx , cancel := context .WithTimeout (reqCtx , vsockConnectTimeout )
36
+ defer cancel ()
34
37
35
- var lastErr error
36
- var currentDelay = initialDelay
38
+ var attemptCount int
39
+ for range time .NewTicker (retryInterval ).C {
40
+ attemptCount ++
41
+ logger = logger .WithField ("attempt" , attemptCount )
37
42
38
- for i := 1 ; i <= retryCount ; i ++ {
39
43
select {
40
- case <- reqCtx .Done ():
41
- return nil , reqCtx .Err ()
44
+ case <- ctx .Done ():
45
+ return nil , ctx .Err ()
42
46
default :
43
47
conn , err := vsock .Dial (contextID , port )
44
48
if err == nil {
45
- logger .WithField ("connection" , conn ).Debug ("Dial succeeded" )
49
+ logger .WithField ("connection" , conn ).Debug ("vsock dial succeeded" )
46
50
return conn , nil
47
51
}
48
52
49
- logger .WithError (err ).Warnf ("vsock dial failed (attempt %d of %d), will retry in %s" , i , retryCount , currentDelay )
50
- time .Sleep (currentDelay )
53
+ // ENXIO and ECONNRESET can be returned while the VM+agent are still in the midst of booting
54
+ if isTemporaryNetErr (err ) || isENXIO (err ) || isECONNRESET (err ) {
55
+ logger .WithError (err ).Debug ("temporary vsock dial failure" )
56
+ continue
57
+ }
51
58
52
- lastErr = err
53
- currentDelay *= delayMultiplier
59
+ logger . WithError ( err ). Error ( "non-temporary vsock dial failure" )
60
+ return nil , err
54
61
}
55
62
}
56
63
57
- logger .WithError (lastErr ).WithFields (logrus.Fields {"context_id" : contextID , "port" : port }).Error ("vsock dial failed" )
58
- return nil , lastErr
64
+ panic ("unreachable code" ) // appeases the compiler, which doesn't know the for loop is infinite
59
65
}
60
66
61
67
// VSockDialConnector provides an IOConnector interface to the VSockDial function.
@@ -77,6 +83,47 @@ func VSockDialConnector(contextID, port uint32) IOConnector {
77
83
}
78
84
}
79
85
86
+ func vsockAccept (reqCtx context.Context , logger * logrus.Entry , port uint32 ) (net.Conn , error ) {
87
+ listener , err := vsock .Listen (port )
88
+ if err != nil {
89
+ return nil , err
90
+ }
91
+
92
+ defer listener .Close ()
93
+
94
+ // Retries occur every 10ms up to vsockConnectTimeout
95
+ const retryInterval = 10 * time .Millisecond
96
+ ctx , cancel := context .WithTimeout (reqCtx , vsockConnectTimeout )
97
+ defer cancel ()
98
+
99
+ var attemptCount int
100
+ for range time .NewTicker (retryInterval ).C {
101
+ attemptCount ++
102
+ logger = logger .WithField ("attempt" , attemptCount )
103
+
104
+ select {
105
+ case <- ctx .Done ():
106
+ return nil , ctx .Err ()
107
+ default :
108
+ // accept is non-blocking so try to accept until we get a connection
109
+ conn , err := listener .Accept ()
110
+ if err == nil {
111
+ return conn , nil
112
+ }
113
+
114
+ if isTemporaryNetErr (err ) {
115
+ logger .WithError (err ).Debug ("temporary stdio vsock accept failure" )
116
+ continue
117
+ }
118
+
119
+ logger .WithError (err ).Error ("non-temporary stdio vsock accept failure" )
120
+ return nil , err
121
+ }
122
+ }
123
+
124
+ panic ("unreachable code" ) // appeases the compiler, which doesn't know the for loop is infinite
125
+ }
126
+
80
127
// VSockAcceptConnector provides an IOConnector that establishes the connection by listening on the provided
81
128
// vsock port and accepting the first connection that comes in.
82
129
func VSockAcceptConnector (port uint32 ) IOConnector {
@@ -86,47 +133,11 @@ func VSockAcceptConnector(port uint32) IOConnector {
86
133
go func () {
87
134
defer close (returnCh )
88
135
89
- listener , err := vsock .Listen (port )
90
- if err != nil {
91
- returnCh <- IOConnectorResult {
92
- Err : err ,
93
- }
94
- return
95
- }
96
-
97
- defer listener .Close ()
98
-
99
- for range time .NewTicker (10 * time .Millisecond ).C {
100
- select {
101
- case <- procCtx .Done ():
102
- returnCh <- IOConnectorResult {
103
- Err : procCtx .Err (),
104
- }
105
- return
106
- default :
107
- // accept is non-blocking so try to accept until we get a connection
108
- conn , err := listener .Accept ()
109
- if err == nil {
110
- returnCh <- IOConnectorResult {
111
- ReadWriteCloser : conn ,
112
- }
113
- return
114
- }
115
-
116
- if isTemporaryNetErr (err ) {
117
- logger .WithError (err ).Debug ("temporary stdio vsock accept failure" )
118
- continue
119
- }
120
-
121
- logger .WithError (err ).Error ("non-temporary stdio vsock accept failure" )
122
- returnCh <- IOConnectorResult {
123
- Err : err ,
124
- }
125
- return
126
- }
136
+ conn , err := vsockAccept (procCtx , logger , port )
137
+ returnCh <- IOConnectorResult {
138
+ ReadWriteCloser : conn ,
139
+ Err : err ,
127
140
}
128
-
129
- panic ("unreachable code" ) // appeases the compiler, which doesn't know the for loop is infinite
130
141
}()
131
142
132
143
return returnCh
@@ -140,3 +151,16 @@ func isTemporaryNetErr(err error) bool {
140
151
141
152
return err != nil && ok && terr .Temporary ()
142
153
}
154
+
155
+ // Unfortunately, as "documented" on various online forums, there's no ideal way to
156
+ // test for actual Linux error codes returned by the net library or wrappers
157
+ // around that library. The common approach is to fall back on string matching,
158
+ // which is done for the functions below
159
+
160
// isENXIO reports whether err's message ends with "no such device". The check is a
// string match on err.Error(), not an errno comparison. A nil err yields false.
//
// NOTE(review): Go's syscall error table appears to render ENXIO as "no such device
// or address" and ENODEV as "no such device" — confirm which errno this is meant to
// detect.
func isENXIO(err error) bool {
	if err == nil {
		return false
	}
	return strings.HasSuffix(err.Error(), "no such device")
}
163
+
164
// isECONNRESET reports whether err's message ends with "connection reset by peer".
// The check is a string match on err.Error(), not an errno comparison. A nil err
// yields false.
func isECONNRESET(err error) bool {
	if err == nil {
		return false
	}
	return strings.HasSuffix(err.Error(), "connection reset by peer")
}
0 commit comments