@@ -237,6 +237,9 @@ func (p *HyperlightDevicePlugin) GetPreferredAllocation(ctx context.Context, req
237237}
238238
239239func (p * HyperlightDevicePlugin ) Start () error {
240+ // Reset stop channel for restart scenarios
241+ p .stopCh = make (chan struct {})
242+
240243 // Remove old socket
241244 if err := os .Remove (serverSock ); err != nil && ! os .IsNotExist (err ) {
242245 log .Printf ("Warning: failed to remove old socket: %v" , err )
@@ -307,6 +310,29 @@ func (p *HyperlightDevicePlugin) Stop() {
307310 log .Println ("Device plugin stopped" )
308311}
309312
313+ // watchKubeletRestart monitors for kubelet restarts by watching the plugin socket.
314+ // When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/.
315+ // This function blocks until the socket is deleted, signaling a kubelet restart.
316+ func (p * HyperlightDevicePlugin ) watchKubeletRestart () {
317+ log .Println ("Watching for kubelet restart (socket deletion)..." )
318+
319+ ticker := time .NewTicker (time .Second )
320+ defer ticker .Stop ()
321+
322+ for {
323+ select {
324+ case <- p .stopCh :
325+ return
326+ case <- ticker .C :
327+ // Check if our socket still exists
328+ if _ , err := os .Stat (serverSock ); os .IsNotExist (err ) {
329+ log .Println ("Plugin socket deleted - kubelet may have restarted" )
330+ return
331+ }
332+ }
333+ }
334+ }
335+
310336func main () {
311337 log .SetFlags (log .LstdFlags | log .Lshortfile )
312338 log .Println ("Starting Hyperlight Device Plugin" )
@@ -316,14 +342,30 @@ func main() {
316342 log .Fatalf ("Failed to create device plugin: %v" , err )
317343 }
318344
319- if err := plugin .Start (); err != nil {
320- log .Fatalf ("Failed to start device plugin: %v" , err )
321- }
322-
323345 // Handle signals for graceful shutdown
324346 sigCh := make (chan os.Signal , 1 )
325347 signal .Notify (sigCh , syscall .SIGINT , syscall .SIGTERM )
326348
349+ // Start plugin with restart handling
350+ go func () {
351+ for {
352+ if err := plugin .Start (); err != nil {
353+ log .Printf ("Failed to start device plugin: %v" , err )
354+ time .Sleep (5 * time .Second )
355+ continue
356+ }
357+
358+ // Watch for kubelet restart (socket deletion)
359+ // When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/
360+ plugin .watchKubeletRestart ()
361+
362+ // If we get here, kubelet restarted - stop current server and re-register
363+ log .Println ("Detected kubelet restart, re-registering..." )
364+ plugin .server .Stop ()
365+ time .Sleep (time .Second ) // Brief pause before restart
366+ }
367+ }()
368+
327369 sig := <- sigCh
328370 log .Printf ("Received signal %v, shutting down" , sig )
329371 plugin .Stop ()
0 commit comments