Skip to content

Commit 0e0414e

Browse files
committed
promote nmagent retry to package, include final retry error, and adapt existing cni code
include last error cause when cooldown function errors (discuss) for compatability use durations instead of millis correct usage of number retries vs number of runs create helper function for retrying in transparent vlan mode create wrapper method that makes all passed in errors temporary (retriable) errors
1 parent ac4f51c commit 0e0414e

File tree

10 files changed

+395
-337
lines changed

10 files changed

+395
-337
lines changed

dhcp/dhcp_windows.go

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package dhcp
33
import (
44
"context"
55
"net"
6+
"time"
67

78
"github.com/Azure/azure-container-networking/retry"
89
"github.com/pkg/errors"
@@ -11,10 +12,10 @@ import (
1112
)
1213

1314
const (
14-
retryCount = 5
15-
retryDelayMillis = 500
16-
ipAssignRetryDelayMillis = 2000
17-
socketTimeoutMillis = 1000
15+
retryCount = 4
16+
retryDelay = 500 * time.Millisecond
17+
ipAssignRetryDelay = 2000 * time.Millisecond
18+
socketTimeoutMillis = 1000
1819
)
1920

2021
var (
@@ -106,21 +107,24 @@ func (c *DHCP) getIPv4InterfaceAddresses(ifName string) ([]net.IP, error) {
106107
return ret, err
107108
}
108109

109-
func (c *DHCP) verifyIPv4InterfaceAddressCount(ifName string, count, maxRuns, sleepMs int) error {
110-
addressCountErr := retry.Do(func() error {
110+
func (c *DHCP) verifyIPv4InterfaceAddressCount(ifName string, count, maxRuns int, sleep time.Duration) error {
111+
retrier := retry.Retrier{
112+
Cooldown: retry.Max(maxRuns, retry.Fixed(sleep)),
113+
}
114+
addressCountErr := retrier.Do(context.Background(), func() error {
111115
addresses, err := c.getIPv4InterfaceAddresses(ifName)
112116
if err != nil || len(addresses) != count {
113117
return errIncorrectAddressCount
114118
}
115119
return nil
116-
}, maxRuns, sleepMs)
120+
})
117121
return addressCountErr
118122
}
119123

120124
// issues a dhcp discover request on an interface by finding the secondary's ip and sending on its ip
121125
func (c *DHCP) DiscoverRequest(ctx context.Context, macAddress net.HardwareAddr, ifName string) error {
122126
// Find the ipv4 address of the secondary interface (we're betting that this gets autoconfigured)
123-
err := c.verifyIPv4InterfaceAddressCount(ifName, 1, retryCount, ipAssignRetryDelayMillis)
127+
err := c.verifyIPv4InterfaceAddressCount(ifName, 1, retryCount, ipAssignRetryDelay)
124128
if err != nil {
125129
return errors.Wrap(err, "failed to get auto ip config assigned in apipa range in time")
126130
}
@@ -173,11 +177,14 @@ func (c *DHCP) DiscoverRequest(ctx context.Context, macAddress net.HardwareAddr,
173177
return errors.Wrap(err, "failed to create socket")
174178
}
175179

180+
retrier := retry.Retrier{
181+
Cooldown: retry.Max(retryCount, retry.Fixed(retryDelay)),
182+
}
176183
// retry sending the packet until it succeeds
177-
err = retry.Do(func() error {
184+
err = retrier.Do(context.Background(), func() error {
178185
_, sockErr := sock.Write(bytesToSend)
179186
return sockErr
180-
}, retryCount, retryDelayMillis)
187+
})
181188
if err != nil {
182189
return errors.Wrap(err, "failed to write to dhcp socket")
183190
}

network/transparent_vlan_endpointclient_linux.go

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package network
22

33
import (
4+
"context"
45
"fmt"
56
"net"
67
"strings"
8+
"time"
79

810
"github.com/Azure/azure-container-networking/iptables"
911
"github.com/Azure/azure-container-networking/netio"
@@ -26,8 +28,8 @@ const (
2628
tunnelingTable = 2 // Packets not entering on the vlan interface go to this routing table
2729
tunnelingMark = 333 // The packets that are to tunnel will be marked with this number
2830
DisableRPFilterCmd = "sysctl -w net.ipv4.conf.all.rp_filter=0" // Command to disable the rp filter for tunneling
29-
numRetries = 5
30-
sleepInMs = 100
31+
numRetries = 4
32+
sleepDelay = 100 * time.Millisecond
3133
)
3234

3335
var errNamespaceCreation = fmt.Errorf("network namespace creation error")
@@ -192,13 +194,13 @@ func (client *TransparentVlanEndpointClient) setLinkNetNSAndConfirm(name string,
192194
}
193195

194196
// confirm veth was moved successfully
195-
err = retry.Do(func() error {
197+
err = client.Retry(func() error {
196198
// retry checking in the namespace if the interface is not detected
197199
return ExecuteInNS(client.nsClient, client.vnetNSName, func() error {
198200
_, ifDetectedErr := client.netioshim.GetNetworkInterfaceByName(client.vlanIfName)
199201
return errors.Wrap(ifDetectedErr, "failed to get vlan veth in namespace")
200202
})
201-
}, numRetries, sleepInMs)
203+
})
202204
if err != nil {
203205
return errors.Wrapf(err, "failed to detect %v inside namespace %v", name, fd)
204206
}
@@ -220,9 +222,9 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
220222
// We assume the only possible error is that the namespace doesn't exist
221223
logger.Info("No existing NS detected. Creating the vnet namespace and switching to it", zap.String("message", existingErr.Error()))
222224

223-
err = retry.Do(func() error {
225+
err = client.Retry(func() error {
224226
return client.createNetworkNamespace(vmNS)
225-
}, numRetries, sleepInMs)
227+
})
226228
if err != nil {
227229
return errors.Wrap(err, "failed to create network namespace")
228230
}
@@ -279,10 +281,10 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
279281
}()
280282

281283
// sometimes there is slight delay in interface creation. check if it exists
282-
err = retry.Do(func() error {
284+
err = client.Retry(func() error {
283285
_, err = client.netioshim.GetNetworkInterfaceByName(client.vlanIfName)
284286
return errors.Wrap(err, "failed to get vlan veth")
285-
}, numRetries, sleepInMs)
287+
})
286288

287289
if err != nil {
288290
deleteNSIfNotNilErr = errors.Wrapf(err, "failed to get vlan veth interface:%s", client.vlanIfName)
@@ -316,21 +318,21 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
316318
}
317319

318320
// Ensure vnet veth is created, as there may be a slight delay
319-
err = retry.Do(func() error {
321+
err = client.Retry(func() error {
320322
_, getErr := client.netioshim.GetNetworkInterfaceByName(client.vnetVethName)
321323
return errors.Wrap(getErr, "failed to get vnet veth")
322-
}, numRetries, sleepInMs)
324+
})
323325
if err != nil {
324326
return errors.Wrap(err, "vnet veth does not exist")
325327
}
326328

327329
// Ensure container veth is created, as there may be a slight delay
328330
var containerIf *net.Interface
329-
err = retry.Do(func() error {
331+
err = client.Retry(func() error {
330332
var getErr error
331333
containerIf, getErr = client.netioshim.GetNetworkInterfaceByName(client.containerVethName)
332334
return errors.Wrap(getErr, "failed to get container veth")
333-
}, numRetries, sleepInMs)
335+
})
334336
if err != nil {
335337
return errors.Wrap(err, "container veth does not exist")
336338
}
@@ -673,6 +675,17 @@ func (client *TransparentVlanEndpointClient) DeleteEndpointsImpl(ep *endpoint, _
673675
return nil
674676
}
675677

678+
// Creates a new retrier with a fixed delay, and treats all errors as retriable
679+
func (client *TransparentVlanEndpointClient) Retry(f func() error) error {
680+
retrier := retry.Retrier{
681+
Cooldown: retry.Max(numRetries, retry.Fixed(sleepDelay)),
682+
}
683+
return retrier.Do(context.Background(), func() error {
684+
// we always want to retry, so all errors are temporary errors
685+
return retry.WrapTemporaryError(f())
686+
})
687+
}
688+
676689
// Helper function that allows executing a function in a VM namespace
677690
// Does not work for process namespaces
678691
func ExecuteInNS(nsc NamespaceClientInterface, nsName string, f func() error) error {

network/transparent_vlan_endpointclient_linux_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ func TestTransparentVlanAddEndpoints(t *testing.T) {
190190
err := tt.client.setLinkNetNSAndConfirm(tt.client.vlanIfName, 1)
191191
if tt.wantErr {
192192
require.Error(t, err)
193-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
193+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
194194
} else {
195195
require.NoError(t, err)
196196
}
@@ -287,7 +287,7 @@ func TestTransparentVlanAddEndpoints(t *testing.T) {
287287
err := tt.client.ensureCleanPopulateVM()
288288
if tt.wantErr {
289289
require.Error(t, err)
290-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
290+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
291291
} else {
292292
require.NoError(t, err)
293293
}
@@ -429,7 +429,7 @@ func TestTransparentVlanAddEndpoints(t *testing.T) {
429429
},
430430
epInfo: &EndpointInfo{},
431431
wantErr: true,
432-
wantErrMsg: "container veth does not exist: failed to get container veth: B1veth0: " + errMockNetIOFail.Error() + "",
432+
wantErrMsg: "failed to get container veth: B1veth0: " + errMockNetIOFail.Error() + "",
433433
},
434434
{
435435
name: "Add endpoints NetNS Get fail",
@@ -489,7 +489,7 @@ func TestTransparentVlanAddEndpoints(t *testing.T) {
489489
err := tt.client.PopulateVM(tt.epInfo)
490490
if tt.wantErr {
491491
require.Error(t, err)
492-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
492+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
493493
} else {
494494
require.NoError(t, err)
495495
}
@@ -579,7 +579,7 @@ func TestTransparentVlanAddEndpoints(t *testing.T) {
579579
err := tt.client.PopulateVnet(tt.epInfo)
580580
if tt.wantErr {
581581
require.Error(t, err)
582-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
582+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
583583
} else {
584584
require.NoError(t, err)
585585
}
@@ -695,7 +695,7 @@ func TestTransparentVlanDeleteEndpoints(t *testing.T) {
695695
err := tt.client.DeleteEndpointsImpl(tt.ep, tt.routesLeft)
696696
if tt.wantErr {
697697
require.Error(t, err)
698-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
698+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
699699
} else {
700700
require.NoError(t, err)
701701
}
@@ -830,7 +830,7 @@ func TestTransparentVlanConfigureContainerInterfacesAndRoutes(t *testing.T) {
830830
err := tt.client.ConfigureContainerInterfacesAndRoutesImpl(tt.epInfo)
831831
if tt.wantErr {
832832
require.Error(t, err)
833-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
833+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
834834
} else {
835835
require.NoError(t, err)
836836
}
@@ -901,7 +901,7 @@ func TestTransparentVlanConfigureContainerInterfacesAndRoutes(t *testing.T) {
901901
err := tt.client.ConfigureVnetInterfacesAndRoutesImpl(tt.epInfo)
902902
if tt.wantErr {
903903
require.Error(t, err)
904-
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v actual:%v", tt.wantErrMsg, err.Error())
904+
require.Contains(t, err.Error(), tt.wantErrMsg, "Expected:%v \nActual:%v", tt.wantErrMsg, err.Error())
905905
} else {
906906
require.NoError(t, err)
907907
}

nmagent/client.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"time"
1313

1414
"github.com/Azure/azure-container-networking/nmagent/internal"
15+
"github.com/Azure/azure-container-networking/retry"
1516
"github.com/pkg/errors"
1617
)
1718

@@ -30,9 +31,9 @@ func NewClient(c Config) (*Client, error) {
3031
host: c.Host,
3132
port: c.Port,
3233
enableTLS: c.UseTLS,
33-
retrier: internal.Retrier{
34+
retrier: retry.Retrier{
3435
// nolint:gomnd // the base parameter is explained in the function
35-
Cooldown: internal.Exponential(1*time.Second, 2),
36+
Cooldown: retry.Exponential(1*time.Second, 2),
3637
},
3738
}
3839

nmagent/client_helpers_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"net/http"
55

66
"github.com/Azure/azure-container-networking/nmagent/internal"
7+
"github.com/Azure/azure-container-networking/retry"
78
)
89

910
// NewTestClient is a factory function available in tests only for creating
@@ -17,8 +18,8 @@ func NewTestClient(transport http.RoundTripper) *Client {
1718
},
1819
host: "localhost",
1920
port: 12345,
20-
retrier: internal.Retrier{
21-
Cooldown: internal.AsFastAsPossible(),
21+
retrier: retry.Retrier{
22+
Cooldown: retry.AsFastAsPossible(),
2223
},
2324
}
2425
}

0 commit comments

Comments
 (0)