Skip to content

Commit 3c81c55

Browse files
committed
nfd-master: fix memory leak when leader election is enabled
Fix a serious memory leak of non-leaders that was caused by bad chan usage. With leader election enabled, the nfdAPIUpdateHandler() was not started for non-leader instances, and thus, there was no reader for the chans that the nfdController uses to communicate what objects to update. This, in turn caused blocking in the kubernetes API informer context (trying to queue data into the chan), piling up requests on each NodeFeature update, consuming more and more memory (that would not be released unless we became the leader). (cherry picked from commit a595439)
1 parent 90e8cee commit 3c81c55

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

pkg/nfd-master/nfd-master.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ type nfdMaster struct {
161161
updaterPool *updaterPool
162162
deniedNs
163163
config *NFDConfig
164+
165+
// isLeader indicates if this instance is the leader, changing dynamically
166+
isLeader bool
164167
}
165168

166169
// NewNfdMaster creates a new NfdMaster server instance.
@@ -344,10 +347,11 @@ func (m *nfdMaster) Run() error {
344347
// Run updater that handles events from the nfd CRD API.
345348
if m.nfdController != nil {
346349
if m.args.EnableLeaderElection {
347-
go m.nfdAPIUpdateHandlerWithLeaderElection()
350+
go m.startLeaderElectionHandler()
348351
} else {
349-
go m.nfdAPIUpdateHandler()
352+
m.isLeader = true
350353
}
354+
go m.nfdAPIUpdateHandler()
351355
}
352356

353357
// Start gRPC server for liveness probe (at this point we're "live")
@@ -506,6 +510,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
506510
case nodeFeatureGroupName := <-m.nfdController.updateNodeFeatureGroupChan:
507511
nodeFeatureGroup[nodeFeatureGroupName] = struct{}{}
508512
case <-rateLimit:
513+
// If we're not the leader, don't do anything, sleep a bit longer
514+
if !m.isLeader {
515+
rateLimit = time.After(5 * time.Second)
516+
break
517+
}
518+
509519
// NodeFeature
510520
errUpdateAll := false
511521
if updateAll {
@@ -1522,7 +1532,7 @@ func (m *nfdMaster) startNfdApiController() error {
15221532
return nil
15231533
}
15241534

1525-
func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
1535+
func (m *nfdMaster) startLeaderElectionHandler() {
15261536
ctx := context.Background()
15271537
lock := &resourcelock.LeaseLock{
15281538
LeaseMeta: metav1.ObjectMeta{
@@ -1543,11 +1553,15 @@ func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
15431553
RenewDeadline: m.config.LeaderElection.RenewDeadline.Duration,
15441554
Callbacks: leaderelection.LeaderCallbacks{
15451555
OnStartedLeading: func(_ context.Context) {
1546-
m.nfdAPIUpdateHandler()
1556+
m.isLeader = true
15471557
},
15481558
OnStoppedLeading: func() {
15491559
// We lost the lock.
15501560
klog.InfoS("leaderelection lock was lost")
1561+
// We stop (i.e. exit), makes sure that in-flight
1562+
// requests/re-tries will be stopped TODO: more graceful
1563+
// handling that does not exit the pod (set m.isLeader to false
1564+
// and flush updater queue and flush updater queues...)
15511565
m.Stop()
15521566
},
15531567
},

0 commit comments

Comments
 (0)