Commit bd2370d

Author: Kubernetes Submit Queue

Merge pull request kubernetes#68084 from bowei/backoff-node-ipam
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md.

Make CIDR allocation retry backoff exponential. This also sets the retry time to be less aggressive.

Fixes kubernetes#67348

```release-note
NONE
```
2 parents 90fed8e + d3facac commit bd2370d
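
For context on the numbers below: the base retry delay moves from 100ms to 250ms and now doubles on every failed attempt, capped by a new maxUpdateRetryTimeout of 5s. The standalone sketch below reproduces the constants and the nodeUpdateRetryTimeout helper from the diff to print the resulting schedule; only the main function is invented here for illustration.

```go
// Standalone sketch (not part of the commit) of the backoff schedule produced
// by the new constants and the nodeUpdateRetryTimeout helper added below.
package main

import (
    "fmt"
    "time"
)

const (
    updateRetryTimeout    = 250 * time.Millisecond // new base retry delay
    maxUpdateRetryTimeout = 5 * time.Second        // new cap on each delay
    updateMaxRetries      = 10                     // retry budget (unchanged)
)

// nodeUpdateRetryTimeout mirrors the helper added in cloud_cidr_allocator.go:
// double the base delay per retry, clamped to maxUpdateRetryTimeout.
func nodeUpdateRetryTimeout(count int) time.Duration {
    timeout := updateRetryTimeout
    for i := 0; i < count && timeout < maxUpdateRetryTimeout; i++ {
        timeout *= 2
    }
    if timeout > maxUpdateRetryTimeout {
        return maxUpdateRetryTimeout
    }
    return timeout
}

func main() {
    // retryParams passes count = retries + 1, so the first requeue uses count 1.
    var total time.Duration
    for count := 1; count <= updateMaxRetries; count++ {
        d := nodeUpdateRetryTimeout(count)
        total += d
        fmt.Printf("retry %2d waits %v\n", count, d)
    }
    fmt.Printf("worst-case total wait before dropping the node: %v\n", total)
    // Prints 500ms, 1s, 2s, 4s, then 5s for the remaining retries (37.5s total).
}
```

Assuming every update fails, a node is therefore requeued after 500ms, 1s, 2s, 4s, and then 5s per attempt until updateMaxRetries (10) is exhausted, roughly 37.5s of waiting in total, after which it is dropped from the queue.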

File tree: 3 files changed (+57, -10 lines)

pkg/controller/nodeipam/ipam/cidr_allocator.go

Lines changed: 4 additions & 1 deletion

@@ -71,7 +71,10 @@ const (
     cidrUpdateRetries = 3
 
     // updateRetryTimeout is the time to wait before requeing a failed node for retry
-    updateRetryTimeout = 100 * time.Millisecond
+    updateRetryTimeout = 250 * time.Millisecond
+
+    // maxUpdateRetryTimeout is the maximum amount of time between timeouts.
+    maxUpdateRetryTimeout = 5 * time.Second
 
     // updateMaxRetries is the max retries for a failed node
     updateMaxRetries = 10

pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go

Lines changed: 33 additions & 9 deletions

@@ -24,6 +24,7 @@ import (
 
     "github.com/golang/glog"
 
+    "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/types"

@@ -33,7 +34,6 @@ import (
     "k8s.io/client-go/tools/cache"
     "k8s.io/client-go/tools/record"
 
-    "k8s.io/api/core/v1"
     clientset "k8s.io/client-go/kubernetes"
     "k8s.io/client-go/kubernetes/scheme"
     v1core "k8s.io/client-go/kubernetes/typed/core/v1"

@@ -155,16 +155,21 @@ func (ca *cloudCIDRAllocator) worker(stopChan <-chan struct{}) {
                 glog.Warning("Channel nodeCIDRUpdateChannel was unexpectedly closed")
                 return
             }
-            if err := ca.updateCIDRAllocation(workItem); err != nil {
-                if ca.canRetry(workItem) {
-                    time.AfterFunc(updateRetryTimeout, func() {
+            if err := ca.updateCIDRAllocation(workItem); err == nil {
+                glog.V(3).Infof("Updated CIDR for %q", workItem)
+                ca.removeNodeFromProcessing(workItem)
+            } else {
+                glog.Errorf("Error updating CIDR for %q: %v", workItem, err)
+                if canRetry, timeout := ca.retryParams(workItem); canRetry {
+                    glog.V(2).Infof("Retrying update for %q after %v", workItem, timeout)
+                    time.AfterFunc(timeout, func() {
                         // Requeue the failed node for update again.
                         ca.nodeUpdateChannel <- workItem
                     })
                     continue
                 }
+                glog.Errorf("Exceeded retry count for %q, dropping from queue", workItem)
             }
-            ca.removeNodeFromProcessing(workItem)
         case <-stopChan:
             return
         }

@@ -181,15 +186,34 @@ func (ca *cloudCIDRAllocator) insertNodeToProcessing(nodeName string) bool {
     return true
 }
 
-func (ca *cloudCIDRAllocator) canRetry(nodeName string) bool {
+func (ca *cloudCIDRAllocator) retryParams(nodeName string) (bool, time.Duration) {
     ca.lock.Lock()
     defer ca.lock.Unlock()
-    count := ca.nodesInProcessing[nodeName].retries + 1
+
+    entry, ok := ca.nodesInProcessing[nodeName]
+    if !ok {
+        glog.Errorf("Cannot get retryParams for %q as entry does not exist", nodeName)
+        return false, 0
+    }
+
+    count := entry.retries + 1
     if count > updateMaxRetries {
-        return false
+        return false, 0
     }
     ca.nodesInProcessing[nodeName].retries = count
-    return true
+
+    return true, nodeUpdateRetryTimeout(count)
+}
+
+func nodeUpdateRetryTimeout(count int) time.Duration {
+    timeout := updateRetryTimeout
+    for i := 0; i < count && timeout < maxUpdateRetryTimeout; i++ {
+        timeout *= 2
+    }
+    if timeout > maxUpdateRetryTimeout {
+        return maxUpdateRetryTimeout
+    }
+    return timeout
 }
 
 func (ca *cloudCIDRAllocator) removeNodeFromProcessing(nodeName string) {
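
To see the requeue mechanics in isolation: on failure the worker does not sleep; it schedules a timer with time.AfterFunc that pushes the node name back onto the update channel, so the goroutine stays free to process other nodes. The sketch below is a toy reproduction of that pattern; the work channel, the simulated failures, and all names are invented for illustration and are not part of the commit.

```go
// Toy illustration (not from the commit) of the requeue-after-delay pattern:
// a failed item is re-sent to the work channel from a timer callback via
// time.AfterFunc, so the worker loop never blocks while backing off.
package main

import (
    "fmt"
    "time"
)

func main() {
    work := make(chan string, 4)
    work <- "node-a"

    const succeedOnAttempt = 3 // pretend the first two updates fail
    attempt := 0

    for item := range work {
        attempt++
        if attempt < succeedOnAttempt {
            // Double the delay each attempt: 500ms, then 1s (mirrors the backoff idea).
            delay := 250 * time.Millisecond << uint(attempt)
            fmt.Printf("update of %s failed (attempt %d), requeueing in %v\n", item, attempt, delay)
            item := item // copy captured by the timer callback
            time.AfterFunc(delay, func() { work <- item })
            continue
        }
        fmt.Printf("update of %s succeeded on attempt %d\n", item, attempt)
        close(work) // end the example; range exits once the channel is drained
    }
}
```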

pkg/controller/nodeipam/ipam/cloud_cidr_allocator_test.go

Lines changed: 20 additions & 0 deletions

@@ -17,6 +17,7 @@ limitations under the License.
 package ipam
 
 import (
+    "fmt"
     "testing"
     "time"
 

@@ -57,3 +58,22 @@ func TestBoundedRetries(t *testing.T) {
         // wait for node to finish processing (should terminate and not time out)
     }
 }
+
+func TestNodeUpdateRetryTimeout(t *testing.T) {
+    for _, tc := range []struct {
+        count int
+        want  time.Duration
+    }{
+        {count: 0, want: 250 * time.Millisecond},
+        {count: 1, want: 500 * time.Millisecond},
+        {count: 2, want: 1000 * time.Millisecond},
+        {count: 3, want: 2000 * time.Millisecond},
+        {count: 50, want: 5000 * time.Millisecond},
+    } {
+        t.Run(fmt.Sprintf("count %d", tc.count), func(t *testing.T) {
+            if got := nodeUpdateRetryTimeout(tc.count); got != tc.want {
+                t.Errorf("nodeUpdateRetryTimeout(tc.count) = %v; want %v", got, tc.want)
+            }
+        })
+    }
+}

0 commit comments
