Skip to content

Commit e20492f

Browse files
dougbtv authored and squeed committed
DHCP lease maintenance should terminate when interface no longer exists.
This change is motivated by observations that threads can accumulate and the DHCP daemon uses an increasing amount of memory. This situation can happen organically when using, say, the bridge CNI, when the bridge has been removed outside of the bridge CNI lifecycle and an interface no longer exists on a pod. The check is performed in a retry loop using the `backoffRetry()` method. Signed-off-by: dougbtv <dosmith@redhat.com>
1 parent 3c224f5 commit e20492f

File tree

1 file changed

+51
-2
lines changed

1 file changed

+51
-2
lines changed

plugins/ipam/dhcp/lease.go

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package main
1616

1717
import (
1818
"context"
19+
"errors"
1920
"fmt"
2021
"log"
2122
"math/rand"
@@ -55,6 +56,13 @@ const (
5556
leaseStateRebinding
5657
)
5758

59+
// Timing for retrying link existence check
60+
const (
61+
linkCheckDelay0 = 1 * time.Second // initial delay between link lookups in checkLinkExistsWithBackoff
62+
linkCheckRetryMax = 10 * time.Second // the retry delay stops being doubled once it reaches this value
63+
linkCheckTotalTimeout = 30 * time.Second // NOTE(review): not referenced in the visible hunks; presumably the intended overall deadline for the link check — confirm
64+
)
65+
5866
// This implementation uses 1 OS thread per lease. This is because
5967
// all the network operations have to be done in network namespace
6068
// of the interface. This can be improved by switching to the proper
@@ -65,6 +73,7 @@ type DHCPLease struct {
6573
clientID string
6674
latestLease *nclient4.Lease
6775
link netlink.Link
76+
linkName string
6877
renewalTime time.Time
6978
rebindingTime time.Time
7079
expireTime time.Time
@@ -190,6 +199,7 @@ func AcquireLease(
190199
}
191200

192201
l.link = link
202+
l.linkName = link.Attrs().Name
193203

194204
if err = l.acquire(); err != nil {
195205
return err
@@ -243,7 +253,7 @@ func withAllOptions(l *DHCPLease) dhcp4.Modifier {
243253

244254
func (l *DHCPLease) acquire() error {
245255
if (l.link.Attrs().Flags & net.FlagUp) != net.FlagUp {
246-
log.Printf("Link %q down. Attempting to set up", l.link.Attrs().Name)
256+
log.Printf("Link %q down. Attempting to set up", l.linkName)
247257
if err := netlink.LinkSetUp(l.link); err != nil {
248258
return err
249259
}
@@ -292,6 +302,14 @@ func (l *DHCPLease) maintain() {
292302
for {
293303
var sleepDur time.Duration
294304

305+
linkCheckCtx, cancel := context.WithTimeoutCause(l.ctx, l.resendTimeout, errNoMoreTries)
306+
defer cancel()
307+
linkExists, _ := checkLinkExistsWithBackoff(linkCheckCtx, l.linkName)
308+
if !linkExists {
309+
log.Printf("%v: interface %s no longer exists or link check failed, terminating lease maintenance", l.clientID, l.linkName)
310+
return
311+
}
312+
295313
switch state {
296314
case leaseStateBound:
297315
sleepDur = time.Until(l.renewalTime)
@@ -344,9 +362,40 @@ func (l *DHCPLease) maintain() {
344362
}
345363
}
346364

365+
func checkLinkExistsWithBackoff(ctx context.Context, linkName string) (bool, error) {
366+
baseDelay := linkCheckDelay0
367+
for {
368+
exists, err := checkLinkByName(linkName)
369+
if err == nil {
370+
return exists, nil
371+
}
372+
373+
select {
374+
case <-ctx.Done():
375+
return false, ctx.Err() // Context's done, return with its error
376+
case <-time.After(baseDelay):
377+
if baseDelay < linkCheckRetryMax {
378+
baseDelay *= 2
379+
}
380+
}
381+
}
382+
}
383+
384+
func checkLinkByName(linkName string) (bool, error) {
385+
_, err := netlink.LinkByName(linkName)
386+
if err != nil {
387+
var linkNotFoundErr *netlink.LinkNotFoundError = &netlink.LinkNotFoundError{}
388+
if errors.As(err, linkNotFoundErr) {
389+
return false, nil
390+
}
391+
return false, err
392+
}
393+
return true, nil
394+
}
395+
347396
// downIface brings l.link down by calling netlink.LinkSetDown. A failure is
// only logged, together with the lease's client ID and the interface name;
// no error is returned to the caller.
func (l *DHCPLease) downIface() {
348397
if err := netlink.LinkSetDown(l.link); err != nil {
349-
log.Printf("%v: failed to bring %v interface DOWN: %v", l.clientID, l.link.Attrs().Name, err)
398+
log.Printf("%v: failed to bring %v interface DOWN: %v", l.clientID, l.linkName, err)
350399
}
351400
}
352401

0 commit comments

Comments
 (0)