Skip to content

Commit f48b62a

Browse files
committed
add retry to nnc update during scaledown (#1970)
* add retry to nnc update during scaledown Signed-off-by: Evan Baker <[email protected]> * test for panic in pool monitor Signed-off-by: Evan Baker <[email protected]> --------- Signed-off-by: Evan Baker <[email protected]>
1 parent d3c8a65 commit f48b62a

File tree

14 files changed

+1238
-69
lines changed

14 files changed

+1238
-69
lines changed

cns/ipampool/monitor.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/Azure/azure-container-networking/cns/types"
1414
"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
1515
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
16+
"github.com/avast/retry-go/v4"
1617
"github.com/pkg/errors"
1718
"github.com/prometheus/client_golang/prometheus"
1819
)
@@ -334,10 +335,20 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
334335
tempNNCSpec.RequestedIPCount -= int64(len(pendingIPAddresses))
335336
logger.Printf("[ipam-pool-monitor] Decreasing pool size, pool %+v, spec %+v", state, tempNNCSpec)
336337

337-
_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
338-
if err != nil {
339-
// caller will retry to update the CRD again
340-
return errors.Wrap(err, "executing UpdateSpec with NNC CLI")
338+
attempts := 0
339+
if err := retry.Do(func() error {
340+
attempts++
341+
_, err := pm.nnccli.UpdateSpec(ctx, &tempNNCSpec)
342+
if err != nil {
343+
// caller will retry to update the CRD again
344+
logger.Printf("failed to update NNC spec attempt #%d, err: %v", attempts, err)
345+
return errors.Wrap(err, "executing UpdateSpec with NNC client")
346+
}
347+
logger.Printf("successfully updated NNC spec attempt #%d", attempts)
348+
return nil
349+
}, retry.Attempts(5), retry.DelayType(retry.BackOffDelay)); err != nil { //nolint:gomnd // ignore retry magic number
350+
logger.Errorf("all attempts failed to update NNC during scale-down, state is corrupt: %v", err)
351+
panic(err)
341352
}
342353

343354
logger.Printf("[ipam-pool-monitor] Decreasing pool size: UpdateCRDSpec succeeded for spec %+v", tempNNCSpec)
@@ -350,7 +361,6 @@ func (pm *Monitor) decreasePoolSize(ctx context.Context, meta metaState, state i
350361
// clear the updatingPendingIpsNotInUse, as we have Updated the CRD
351362
logger.Printf("[ipam-pool-monitor] cleaning the updatingPendingIpsNotInUse, existing length %d", pm.metastate.notInUseCount)
352363
pm.metastate.notInUseCount = 0
353-
354364
return nil
355365
}
356366

cns/ipampool/monitor_test.go

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package ipampool
22

33
import (
44
"context"
5+
"errors"
56
"testing"
67
"time"
78

@@ -21,6 +22,12 @@ func (f *fakeNodeNetworkConfigUpdater) UpdateSpec(ctx context.Context, spec *v1a
2122
return f.nnc, nil
2223
}
2324

25+
type fakeNodeNetworkConfigUpdaterFunc func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error)
26+
27+
func (f fakeNodeNetworkConfigUpdaterFunc) UpdateSpec(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) {
28+
return f(ctx, spec)
29+
}
30+
2431
type directUpdatePoolMonitor struct {
2532
m *Monitor
2633
cns.IPAMPoolMonitor
@@ -45,7 +52,7 @@ type testState struct {
4552
totalIPs int64
4653
}
4754

48-
func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
55+
func initFakes(state testState, nnccli nodeNetworkConfigSpecUpdater) (*fakes.HTTPServiceFake, *fakes.RequestControllerFake, *Monitor) {
4956
logger.InitLogger("testlogs", 0, 0, "./")
5057

5158
scalarUnits := v1alpha.Scaler{
@@ -61,8 +68,11 @@ func initFakes(state testState) (*fakes.HTTPServiceFake, *fakes.RequestControlle
6168
}
6269
fakecns := fakes.NewHTTPServiceFake()
6370
fakerc := fakes.NewRequestControllerFake(fakecns, scalarUnits, subnetaddresspace, state.totalIPs)
71+
if nnccli == nil {
72+
nnccli = &fakeNodeNetworkConfigUpdater{fakerc.NNC}
73+
}
6474

65-
poolmonitor := NewMonitor(fakecns, &fakeNodeNetworkConfigUpdater{fakerc.NNC}, nil, &Options{RefreshDelay: 100 * time.Second})
75+
poolmonitor := NewMonitor(fakecns, nnccli, nil, &Options{RefreshDelay: 100 * time.Second})
6676
poolmonitor.metastate = metaState{
6777
batch: state.batch,
6878
max: state.max,
@@ -127,7 +137,7 @@ func TestPoolSizeIncrease(t *testing.T) {
127137
for _, tt := range tests {
128138
tt := tt
129139
t.Run(tt.name, func(t *testing.T) {
130-
fakecns, fakerc, poolmonitor := initFakes(tt.in)
140+
fakecns, fakerc, poolmonitor := initFakes(tt.in, nil)
131141
assert.NoError(t, fakerc.Reconcile(true))
132142

133143
// When poolmonitor reconcile is called, trigger increase and cache goal state
@@ -162,7 +172,7 @@ func TestPoolIncreaseDoesntChangeWhenIncreaseIsAlreadyInProgress(t *testing.T) {
162172
max: 30,
163173
}
164174

165-
fakecns, fakerc, poolmonitor := initFakes(initState)
175+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
166176
assert.NoError(t, fakerc.Reconcile(true))
167177

168178
// When poolmonitor reconcile is called, trigger increase and cache goal state
@@ -201,7 +211,7 @@ func TestPoolSizeIncreaseIdempotency(t *testing.T) {
201211
max: 30,
202212
}
203213

204-
_, fakerc, poolmonitor := initFakes(initState)
214+
_, fakerc, poolmonitor := initFakes(initState, nil)
205215
assert.NoError(t, fakerc.Reconcile(true))
206216

207217
// When poolmonitor reconcile is called, trigger increase and cache goal state
@@ -227,7 +237,7 @@ func TestPoolIncreasePastNodeLimit(t *testing.T) {
227237
max: 30,
228238
}
229239

230-
_, fakerc, poolmonitor := initFakes(initState)
240+
_, fakerc, poolmonitor := initFakes(initState, nil)
231241
assert.NoError(t, fakerc.Reconcile(true))
232242

233243
// When poolmonitor reconcile is called, trigger increase and cache goal state
@@ -247,7 +257,7 @@ func TestPoolIncreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
247257
max: 30,
248258
}
249259

250-
_, fakerc, poolmonitor := initFakes(initState)
260+
_, fakerc, poolmonitor := initFakes(initState, nil)
251261
assert.NoError(t, fakerc.Reconcile(true))
252262

253263
// When poolmonitor reconcile is called, trigger increase and cache goal state
@@ -267,7 +277,7 @@ func TestIncreaseWithPendingRelease(t *testing.T) {
267277
max: 250,
268278
pendingRelease: 16,
269279
}
270-
_, rc, mon := initFakes(initState)
280+
_, rc, mon := initFakes(initState, nil)
271281
assert.NoError(t, rc.Reconcile(true))
272282
assert.NoError(t, mon.reconcile(context.Background()))
273283
assert.Equal(t, int64(32), mon.spec.RequestedIPCount)
@@ -333,7 +343,7 @@ func TestPoolDecrease(t *testing.T) {
333343
for _, tt := range tests {
334344
tt := tt
335345
t.Run(tt.name, func(t *testing.T) {
336-
fakecns, fakerc, poolmonitor := initFakes(tt.in)
346+
fakecns, fakerc, poolmonitor := initFakes(tt.in, nil)
337347
assert.NoError(t, fakerc.Reconcile(true))
338348

339349
// Decrease the number of allocated IPs down to target. This may trigger a scale down.
@@ -374,7 +384,7 @@ func TestPoolSizeDecreaseWhenDecreaseHasAlreadyBeenRequested(t *testing.T) {
374384
max: 30,
375385
}
376386

377-
fakecns, fakerc, poolmonitor := initFakes(initState)
387+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
378388
assert.NoError(t, fakerc.Reconcile(true))
379389

380390
// Pool monitor does nothing, as the current number of IPs falls in the threshold
@@ -413,7 +423,7 @@ func TestDecreaseAndIncreaseToSameCount(t *testing.T) {
413423
max: 30,
414424
}
415425

416-
fakecns, fakerc, poolmonitor := initFakes(initState)
426+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
417427
assert.NoError(t, fakerc.Reconcile(true))
418428
assert.NoError(t, poolmonitor.reconcile(context.Background()))
419429
assert.EqualValues(t, 20, poolmonitor.spec.RequestedIPCount)
@@ -458,7 +468,7 @@ func TestPoolSizeDecreaseToReallyLow(t *testing.T) {
458468
max: 30,
459469
}
460470

461-
fakecns, fakerc, poolmonitor := initFakes(initState)
471+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
462472
assert.NoError(t, fakerc.Reconcile(true))
463473

464474
// Pool monitor does nothing, as the current number of IPs falls in the threshold
@@ -499,7 +509,7 @@ func TestDecreaseAfterNodeLimitReached(t *testing.T) {
499509
releaseThresholdPercent: 150,
500510
max: 30,
501511
}
502-
fakecns, fakerc, poolmonitor := initFakes(initState)
512+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
503513
assert.NoError(t, fakerc.Reconcile(true))
504514

505515
assert.NoError(t, poolmonitor.reconcile(context.Background()))
@@ -524,7 +534,7 @@ func TestDecreaseWithPendingRelease(t *testing.T) {
524534
totalIPs: 64,
525535
max: 250,
526536
}
527-
fakecns, fakerc, poolmonitor := initFakes(initState)
537+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
528538
fakerc.NNC.Spec.RequestedIPCount = 48
529539
assert.NoError(t, fakerc.Reconcile(true))
530540

@@ -547,6 +557,35 @@ func TestDecreaseWithPendingRelease(t *testing.T) {
547557
assert.Len(t, poolmonitor.spec.IPsNotInUse, int(initState.batch)+int(initState.pendingRelease))
548558
}
549559

560+
func TestDecreaseWithAPIServerFailure(t *testing.T) {
561+
initState := testState{
562+
batch: 16,
563+
assigned: 46,
564+
allocated: 64,
565+
pendingRelease: 0,
566+
requestThresholdPercent: 50,
567+
releaseThresholdPercent: 150,
568+
totalIPs: 64,
569+
max: 250,
570+
}
571+
var errNNCCLi fakeNodeNetworkConfigUpdaterFunc = func(ctx context.Context, spec *v1alpha.NodeNetworkConfigSpec) (*v1alpha.NodeNetworkConfig, error) {
572+
return nil, errors.New("fake APIServer failure") //nolint:goerr113 // this is a fake error
573+
}
574+
575+
fakecns, fakerc, poolmonitor := initFakes(initState, errNNCCLi)
576+
fakerc.NNC.Spec.RequestedIPCount = initState.totalIPs
577+
assert.NoError(t, fakerc.Reconcile(true))
578+
579+
assert.NoError(t, poolmonitor.reconcile(context.Background()))
580+
581+
// release some IPs
582+
assert.NoError(t, fakecns.SetNumberOfAssignedIPs(40))
583+
// check that the pool monitor panics if it is not able to publish the updated NNC
584+
assert.Panics(t, func() {
585+
_ = poolmonitor.reconcile(context.Background())
586+
})
587+
}
588+
550589
func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
551590
initState := testState{
552591
batch: 31,
@@ -557,7 +596,7 @@ func TestPoolDecreaseBatchSizeGreaterThanMaxPodIPCount(t *testing.T) {
557596
max: 30,
558597
}
559598

560-
fakecns, fakerc, poolmonitor := initFakes(initState)
599+
fakecns, fakerc, poolmonitor := initFakes(initState, nil)
561600
assert.NoError(t, fakerc.Reconcile(true))
562601

563602
// When poolmonitor reconcile is called, trigger increase and cache goal state

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ require (
1010
github.com/Microsoft/go-winio v0.4.17
1111
github.com/Microsoft/hcsshim v0.8.23
1212
github.com/avast/retry-go/v3 v3.1.1
13+
github.com/avast/retry-go/v4 v4.3.4
1314
github.com/billgraziano/dpapi v0.4.0
1415
github.com/containernetworking/cni v1.1.2
1516
github.com/docker/libnetwork v0.8.0-dev.2.0.20210525090646-64b7a4574d14
@@ -30,7 +31,7 @@ require (
3031
github.com/spf13/cobra v1.6.1
3132
github.com/spf13/pflag v1.0.5
3233
github.com/spf13/viper v1.14.0
33-
github.com/stretchr/testify v1.8.1
34+
github.com/stretchr/testify v1.8.2
3435
go.uber.org/zap v1.24.0
3536
golang.org/x/sys v0.3.0
3637
google.golang.org/grpc v1.52.0

go.sum

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
8989
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
9090
github.com/avast/retry-go/v3 v3.1.1 h1:49Scxf4v8PmiQ/nY0aY3p0hDueqSmc7++cBbtiDGu2g=
9191
github.com/avast/retry-go/v3 v3.1.1/go.mod h1:6cXRK369RpzFL3UQGqIUp9Q7GDrams+KsYWrfNA1/nQ=
92+
github.com/avast/retry-go/v4 v4.3.4 h1:pHLkL7jvCvP317I8Ge+Km2Yhntv3SdkJm7uekkqbKhM=
93+
github.com/avast/retry-go/v4 v4.3.4/go.mod h1:rv+Nla6Vk3/ilU0H51VHddWHiwimzX66yZ0JT6T+UvE=
9294
github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0=
9395
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
9496
github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
@@ -741,8 +743,8 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
741743
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
742744
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
743745
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
744-
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
745-
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
746+
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
747+
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
746748
github.com/subosito/gotenv v1.4.1 h1:jyEFiXpy21Wm81FBN71l9VoMMV8H8jG+qIK3GCpY6Qs=
747749
github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0=
748750
github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=

vendor/github.com/avast/retry-go/v4/.gitignore

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/avast/retry-go/v4/.godocdown.tmpl

Lines changed: 38 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/avast/retry-go/v4/LICENSE

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)