@@ -190,19 +190,19 @@ func (p *HyperlightDevicePlugin) ListAndWatch(req *pluginapi.Empty, srv pluginap
190190 case <- p .stopCh :
191191 return nil
192192 case <- ticker .C :
193- newHealth := pluginapi .Healthy
193+ health := pluginapi .Healthy
194194 if _ , err := os .Stat (p .devicePath ); err != nil {
195- newHealth = pluginapi .Unhealthy
195+ health = pluginapi .Unhealthy
196196 klog .Warningf ("Device %s not found, marking all devices unhealthy" , p .devicePath )
197197 }
198198
199199 // Check if health changed (compare against first device as representative)
200- if p .devices [0 ].Health != newHealth {
200+ if p .devices [0 ].Health != health {
201201 // Update ALL devices - they all share the same underlying hypervisor device
202202 for i := range p .devices {
203- p .devices [i ].Health = newHealth
203+ p .devices [i ].Health = health
204204 }
205- klog .Infof ("Device health changed to %s for all %d devices" , newHealth , len (p .devices ))
205+ klog .Infof ("Device health changed to %s for all %d devices" , health , len (p .devices ))
206206 if err := srv .Send (& pluginapi.ListAndWatchResponse {Devices : p .devices }); err != nil {
207207 return err
208208 }
@@ -317,7 +317,6 @@ func (p *HyperlightDevicePlugin) Stop() {
317317}
318318
319319// newFSWatcher creates a filesystem watcher for kubelet restart detection.
320- // This is the industry-standard approach used by NVIDIA, Intel, and other device plugins.
321320func newFSWatcher (files ... string ) (* fsnotify.Watcher , error ) {
322321 watcher , err := fsnotify .NewWatcher ()
323322 if err != nil {
@@ -336,7 +335,7 @@ func newFSWatcher(files ...string) (*fsnotify.Watcher, error) {
336335
337336// watchKubeletRestart monitors for kubelet restarts using fsnotify.
338337// When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/.
339- // This function blocks until it detects a relevant filesystem event .
338+ // This function blocks until it detects our plugin socket being deleted or p.stopCh is signaled; if an fsnotify channel closes, it falls back to polling.
340339func (p * HyperlightDevicePlugin ) watchKubeletRestart () {
341340 klog .Info ("Watching for kubelet restart using fsnotify..." )
342341
@@ -352,17 +351,22 @@ func (p *HyperlightDevicePlugin) watchKubeletRestart() {
352351 select {
353352 case <- p .stopCh :
354353 return
355- case event := <- watcher .Events :
354+ case event , ok := <- watcher .Events :
355+ if ! ok {
356+ klog .Warning ("fsnotify events channel closed, falling back to polling" )
357+ p .watchKubeletRestartPolling ()
358+ return
359+ }
356360 if event .Name == serverSock && (event .Op & fsnotify .Remove ) == fsnotify .Remove {
357361 klog .Info ("Plugin socket deleted - kubelet may have restarted" )
358362 return
359363 }
360- // Also watch for kubelet socket recreation (indicates kubelet restart complete)
361- if event .Name == kubeletSock && (event .Op & fsnotify .Create ) == fsnotify .Create {
362- klog .Info ("Kubelet socket recreated - kubelet restart detected" )
364+ case err , ok := <- watcher .Errors :
365+ if ! ok {
366+ klog .Warning ("fsnotify errors channel closed, falling back to polling" )
367+ p .watchKubeletRestartPolling ()
363368 return
364369 }
365- case err := <- watcher .Errors :
366370 klog .Warningf ("fsnotify error: %v" , err )
367371 }
368372 }
0 commit comments