Skip to content

Commit 11401f6

Browse files
committed
nfd-master: fix memory leak when leader election is enabled
Fix a serious memory leak in non-leader instances that was caused by incorrect channel usage. With leader election enabled, nfdAPIUpdateHandler() was not started for non-leader instances, and thus there was no reader for the channels that the nfdController uses to communicate which objects to update. This, in turn, caused blocking in the Kubernetes API informer context (trying to queue data into the channel), piling up requests on each NodeFeature update and consuming more and more memory (which would not be released unless the instance became the leader). (cherry picked from commit a595439)
1 parent 8bf4553 commit 11401f6

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

pkg/nfd-master/nfd-master.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ type nfdMaster struct {
159159
updaterPool *updaterPool
160160
deniedNs
161161
config *NFDConfig
162+
163+
// isLeader indicates if this instance is the leader, changing dynamically
164+
isLeader bool
162165
}
163166

164167
// NewNfdMaster creates a new NfdMaster server instance.
@@ -318,10 +321,11 @@ func (m *nfdMaster) Run() error {
318321
// Run updater that handles events from the nfd CRD API.
319322
if m.nfdController != nil {
320323
if m.args.EnableLeaderElection {
321-
go m.nfdAPIUpdateHandlerWithLeaderElection()
324+
go m.startLeaderElectionHandler()
322325
} else {
323-
go m.nfdAPIUpdateHandler()
326+
m.isLeader = true
324327
}
328+
go m.nfdAPIUpdateHandler()
325329
}
326330

327331
// Start gRPC server for liveness probe (at this point we're "live")
@@ -394,6 +398,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
394398
case nodeFeatureGroupName := <-m.nfdController.updateNodeFeatureGroupChan:
395399
nodeFeatureGroup[nodeFeatureGroupName] = struct{}{}
396400
case <-rateLimit:
401+
// If we're not the leader, don't do anything, sleep a bit longer
402+
if !m.isLeader {
403+
rateLimit = time.After(5 * time.Second)
404+
break
405+
}
406+
397407
// NodeFeature
398408
errUpdateAll := false
399409
if updateAll {
@@ -1359,7 +1369,7 @@ func (m *nfdMaster) startNfdApiController() error {
13591369
return nil
13601370
}
13611371

1362-
func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
1372+
func (m *nfdMaster) startLeaderElectionHandler() {
13631373
ctx := context.Background()
13641374
lock := &resourcelock.LeaseLock{
13651375
LeaseMeta: metav1.ObjectMeta{
@@ -1380,11 +1390,15 @@ func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
13801390
RenewDeadline: m.config.LeaderElection.RenewDeadline.Duration,
13811391
Callbacks: leaderelection.LeaderCallbacks{
13821392
OnStartedLeading: func(_ context.Context) {
1383-
m.nfdAPIUpdateHandler()
1393+
m.isLeader = true
13841394
},
13851395
OnStoppedLeading: func() {
13861396
// We lost the lock.
13871397
klog.InfoS("leaderelection lock was lost")
1398+
// We stop (i.e. exit), which makes sure that in-flight
1399+
// requests/re-tries will be stopped. TODO: more graceful
1400+
// handling that does not exit the pod (set m.isLeader to false
1401+
// and flush updater queues...)
13881402
m.Stop()
13891403
},
13901404
},

0 commit comments

Comments
 (0)