Skip to content

Commit ff6cc35

Browse files
authored
handle kubelet restart (#4)
Signed-off-by: Simon Davies <[email protected]>
1 parent 1a7908e commit ff6cc35

File tree

1 file changed

+46
-4
lines changed

1 file changed

+46
-4
lines changed

device-plugin/main.go

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,9 @@ func (p *HyperlightDevicePlugin) GetPreferredAllocation(ctx context.Context, req
237237
}
238238

239239
func (p *HyperlightDevicePlugin) Start() error {
240+
// Reset stop channel for restart scenarios
241+
p.stopCh = make(chan struct{})
242+
240243
// Remove old socket
241244
if err := os.Remove(serverSock); err != nil && !os.IsNotExist(err) {
242245
log.Printf("Warning: failed to remove old socket: %v", err)
@@ -307,6 +310,29 @@ func (p *HyperlightDevicePlugin) Stop() {
307310
log.Println("Device plugin stopped")
308311
}
309312

313+
// watchKubeletRestart monitors for kubelet restarts by watching the plugin socket.
314+
// When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/.
315+
// This function blocks until the socket is deleted, signaling a kubelet restart.
316+
func (p *HyperlightDevicePlugin) watchKubeletRestart() {
317+
log.Println("Watching for kubelet restart (socket deletion)...")
318+
319+
ticker := time.NewTicker(time.Second)
320+
defer ticker.Stop()
321+
322+
for {
323+
select {
324+
case <-p.stopCh:
325+
return
326+
case <-ticker.C:
327+
// Check if our socket still exists
328+
if _, err := os.Stat(serverSock); os.IsNotExist(err) {
329+
log.Println("Plugin socket deleted - kubelet may have restarted")
330+
return
331+
}
332+
}
333+
}
334+
}
335+
310336
func main() {
311337
log.SetFlags(log.LstdFlags | log.Lshortfile)
312338
log.Println("Starting Hyperlight Device Plugin")
@@ -316,14 +342,30 @@ func main() {
316342
log.Fatalf("Failed to create device plugin: %v", err)
317343
}
318344

319-
if err := plugin.Start(); err != nil {
320-
log.Fatalf("Failed to start device plugin: %v", err)
321-
}
322-
323345
// Handle signals for graceful shutdown
324346
sigCh := make(chan os.Signal, 1)
325347
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
326348

349+
// Start plugin with restart handling
350+
go func() {
351+
for {
352+
if err := plugin.Start(); err != nil {
353+
log.Printf("Failed to start device plugin: %v", err)
354+
time.Sleep(5 * time.Second)
355+
continue
356+
}
357+
358+
// Watch for kubelet restart (socket deletion)
359+
// When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/
360+
plugin.watchKubeletRestart()
361+
362+
// If we get here, kubelet restarted - stop current server and re-register
363+
log.Println("Detected kubelet restart, re-registering...")
364+
plugin.server.Stop()
365+
time.Sleep(time.Second) // Brief pause before restart
366+
}
367+
}()
368+
327369
sig := <-sigCh
328370
log.Printf("Received signal %v, shutting down", sig)
329371
plugin.Stop()

0 commit comments

Comments
 (0)