Skip to content

Commit 86e835b

Browse files
committed
address pr comments
1 parent b231d1b commit 86e835b

File tree

3 files changed

+111
-3
lines changed

3 files changed

+111
-3
lines changed

cmd/api/main.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ func run() error {
8181
}
8282
logger.Info("Network manager initialized")
8383

84+
// Reconcile device state (clears orphaned attachments from crashed VMs)
85+
logger.Info("Reconciling device state...")
86+
if err := app.DeviceManager.ReconcileDevices(app.Ctx); err != nil {
87+
logger.Error("failed to reconcile device state", "error", err)
88+
return fmt.Errorf("reconcile device state: %w", err)
89+
}
90+
8491
// Create router
8592
r := chi.NewRouter()
8693

lib/devices/manager.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ type Manager interface {
4242

4343
// MarkDetached marks a device as detached from an instance
4444
MarkDetached(ctx context.Context, deviceID string) error
45+
46+
// ReconcileDevices cleans up stale device state on startup.
47+
// It detects devices with AttachedTo referencing non-existent instances
48+
// and clears the orphaned attachment state.
49+
ReconcileDevices(ctx context.Context) error
4550
}
4651

4752
type manager struct {
@@ -59,6 +64,9 @@ func NewManager(p *paths.Paths) Manager {
5964
}
6065

6166
func (m *manager) ListDevices(ctx context.Context) ([]Device, error) {
67+
// RLock protects against concurrent directory modifications (CreateDevice/DeleteDevice)
68+
// during iteration. While individual file reads are atomic, directory iteration could
69+
// see inconsistent state if a device is being created or deleted concurrently.
6270
m.mu.RLock()
6371
defer m.mu.RUnlock()
6472

@@ -172,6 +180,8 @@ func (m *manager) CreateDevice(ctx context.Context, req CreateDeviceRequest) (*D
172180
}
173181

174182
func (m *manager) GetDevice(ctx context.Context, idOrName string) (*Device, error) {
183+
// RLock protects against concurrent modifications while looking up by name,
184+
// which requires iterating the devices directory.
175185
m.mu.RLock()
176186
defer m.mu.RUnlock()
177187

@@ -343,6 +353,79 @@ func (m *manager) MarkDetached(ctx context.Context, deviceID string) error {
343353
return m.saveDevice(device)
344354
}
345355

356+
// ReconcileDevices cleans up stale device state on startup.
357+
// It scans all registered devices and clears AttachedTo for any device
358+
// that references an instance that no longer exists.
359+
func (m *manager) ReconcileDevices(ctx context.Context) error {
360+
log := logger.FromContext(ctx)
361+
log.InfoContext(ctx, "reconciling device state")
362+
363+
m.mu.Lock()
364+
defer m.mu.Unlock()
365+
366+
entries, err := os.ReadDir(m.paths.DevicesDir())
367+
if err != nil {
368+
if os.IsNotExist(err) {
369+
// No devices directory yet, nothing to reconcile
370+
return nil
371+
}
372+
return fmt.Errorf("read devices dir: %w", err)
373+
}
374+
375+
var reconciled int
376+
for _, entry := range entries {
377+
if !entry.IsDir() {
378+
continue
379+
}
380+
381+
device, err := m.loadDevice(entry.Name())
382+
if err != nil {
383+
log.WarnContext(ctx, "failed to load device during reconciliation",
384+
"device_id", entry.Name(),
385+
"error", err,
386+
)
387+
continue
388+
}
389+
390+
// Skip devices that aren't attached to anything
391+
if device.AttachedTo == nil {
392+
continue
393+
}
394+
395+
// Check if the referenced instance still exists
396+
instanceID := *device.AttachedTo
397+
instanceDir := m.paths.InstanceDir(instanceID)
398+
if _, err := os.Stat(instanceDir); os.IsNotExist(err) {
399+
// Instance no longer exists - clear the orphaned attachment
400+
log.WarnContext(ctx, "clearing orphaned device attachment",
401+
"device_id", device.Id,
402+
"device_name", device.Name,
403+
"orphaned_instance_id", instanceID,
404+
)
405+
406+
device.AttachedTo = nil
407+
if err := m.saveDevice(device); err != nil {
408+
log.ErrorContext(ctx, "failed to save device after clearing attachment",
409+
"device_id", device.Id,
410+
"error", err,
411+
)
412+
continue
413+
}
414+
reconciled++
415+
}
416+
}
417+
418+
if reconciled > 0 {
419+
log.InfoContext(ctx, "device reconciliation complete",
420+
"orphaned_attachments_cleared", reconciled,
421+
)
422+
} else {
423+
log.InfoContext(ctx, "device reconciliation complete, no orphaned attachments found")
424+
}
425+
426+
return nil
427+
}
428+
346429
// Helper methods
347430

348431
func (m *manager) loadDevice(id string) (*Device, error) {

lib/devices/vfio.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,27 @@ func (v *VFIOBinder) stopNvidiaPersistenced() error {
191191
return fmt.Errorf("failed to stop nvidia-persistenced (try: sudo systemctl stop nvidia-persistenced)")
192192
}
193193

194-
// Give it a moment to exit
195-
time.Sleep(500 * time.Millisecond)
196-
return nil
194+
// Wait for process to exit with polling instead of arbitrary sleep
195+
return v.waitForProcessExit("nvidia-persistenced", 2*time.Second)
196+
}
197+
198+
// waitForProcessExit polls for a process to exit, with timeout
199+
func (v *VFIOBinder) waitForProcessExit(processName string, timeout time.Duration) error {
200+
deadline := time.Now().Add(timeout)
201+
pollInterval := 100 * time.Millisecond
202+
203+
for time.Now().Before(deadline) {
204+
checkCmd := exec.Command("pgrep", processName)
205+
if checkCmd.Run() != nil {
206+
// Process no longer exists
207+
return nil
208+
}
209+
time.Sleep(pollInterval)
210+
}
211+
212+
// Timeout - process still running
213+
slog.Warn("timeout waiting for process to exit", "process", processName, "timeout", timeout)
214+
return nil // Continue anyway, the bind might still work
197215
}
198216

199217
// startNvidiaPersistenced starts the nvidia-persistenced service

0 commit comments

Comments
 (0)