-
Notifications
You must be signed in to change notification settings - Fork 0
handle kubelet restart #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -237,6 +237,9 @@ func (p *HyperlightDevicePlugin) GetPreferredAllocation(ctx context.Context, req | |||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| func (p *HyperlightDevicePlugin) Start() error { | ||||||||||||||
| // Reset stop channel for restart scenarios | ||||||||||||||
| p.stopCh = make(chan struct{}) | ||||||||||||||
|
|
||||||||||||||
| // Remove old socket | ||||||||||||||
| if err := os.Remove(serverSock); err != nil && !os.IsNotExist(err) { | ||||||||||||||
| log.Printf("Warning: failed to remove old socket: %v", err) | ||||||||||||||
|
|
@@ -307,6 +310,29 @@ func (p *HyperlightDevicePlugin) Stop() { | |||||||||||||
| log.Println("Device plugin stopped") | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // watchKubeletRestart monitors for kubelet restarts by watching the plugin socket. | ||||||||||||||
| // When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/. | ||||||||||||||
| // This function blocks until the socket is deleted, signaling a kubelet restart. | ||||||||||||||
| func (p *HyperlightDevicePlugin) watchKubeletRestart() { | ||||||||||||||
| log.Println("Watching for kubelet restart (socket deletion)...") | ||||||||||||||
|
|
||||||||||||||
| ticker := time.NewTicker(time.Second) | ||||||||||||||
| defer ticker.Stop() | ||||||||||||||
|
|
||||||||||||||
| for { | ||||||||||||||
| select { | ||||||||||||||
| case <-p.stopCh: | ||||||||||||||
| return | ||||||||||||||
| case <-ticker.C: | ||||||||||||||
| // Check if our socket still exists | ||||||||||||||
| if _, err := os.Stat(serverSock); os.IsNotExist(err) { | ||||||||||||||
| log.Println("Plugin socket deleted - kubelet may have restarted") | ||||||||||||||
| return | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
Comment on lines
+319
to
+333
|
||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| func main() { | ||||||||||||||
| log.SetFlags(log.LstdFlags | log.Lshortfile) | ||||||||||||||
| log.Println("Starting Hyperlight Device Plugin") | ||||||||||||||
|
|
@@ -316,14 +342,30 @@ func main() { | |||||||||||||
| log.Fatalf("Failed to create device plugin: %v", err) | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| if err := plugin.Start(); err != nil { | ||||||||||||||
| log.Fatalf("Failed to start device plugin: %v", err) | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // Handle signals for graceful shutdown | ||||||||||||||
| sigCh := make(chan os.Signal, 1) | ||||||||||||||
| signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) | ||||||||||||||
|
|
||||||||||||||
| // Start plugin with restart handling | ||||||||||||||
| go func() { | ||||||||||||||
| for { | ||||||||||||||
| if err := plugin.Start(); err != nil { | ||||||||||||||
| log.Printf("Failed to start device plugin: %v", err) | ||||||||||||||
| time.Sleep(5 * time.Second) | ||||||||||||||
| continue | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // Watch for kubelet restart (socket deletion) | ||||||||||||||
| // When kubelet restarts, it deletes all sockets in /var/lib/kubelet/device-plugins/ | ||||||||||||||
| plugin.watchKubeletRestart() | ||||||||||||||
|
|
||||||||||||||
| // If we get here, kubelet restarted - stop current server and re-register | ||||||||||||||
| log.Println("Detected kubelet restart, re-registering...") | ||||||||||||||
| plugin.server.Stop() | ||||||||||||||
|
Comment on lines
+362
to
+364
|
||||||||||||||
| // If we get here, kubelet restarted - stop current server and re-register | |
| log.Println("Detected kubelet restart, re-registering...") | |
| plugin.server.Stop() | |
| // If we get here, kubelet restarted - stop current plugin and re-register | |
| log.Println("Detected kubelet restart, re-registering...") | |
| plugin.Stop() |
Copilot
AI
Jan 7, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The restart loop does not properly handle the case where watchKubeletRestart() returns due to stopCh being closed (during graceful shutdown). When plugin.Stop() is called in line 371, it closes stopCh, causing watchKubeletRestart() to return. The loop then calls plugin.server.Stop() and attempts to restart, but the program is already in shutdown mode. The loop should check if it's in shutdown mode after watchKubeletRestart() returns.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Recreating the stop channel in Start() creates a race condition. The watchKubeletRestart() goroutine and ListAndWatch() function may be reading from the old stopCh while a new one is created here. When Stop() closes the old channel, these goroutines will still be blocked on the new channel. This causes the plugin to not shut down properly on restart cycles.