Skip to content

Commit a5d652a

Browse files
committed
Add logging, safer orphan cleanup
1 parent a0e8ec6 commit a5d652a

File tree

4 files changed

+151
-31
lines changed

4 files changed

+151
-31
lines changed

cmd/api/main.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/onkernel/hypeman"
2323
"github.com/onkernel/hypeman/cmd/api/api"
2424
"github.com/onkernel/hypeman/cmd/api/config"
25+
"github.com/onkernel/hypeman/lib/devices"
2526
"github.com/onkernel/hypeman/lib/guest"
2627
"github.com/onkernel/hypeman/lib/hypervisor/qemu"
2728
"github.com/onkernel/hypeman/lib/instances"
@@ -200,6 +201,26 @@ func run() error {
200201
return fmt.Errorf("reconcile device state: %w", err)
201202
}
202203

204+
// Reconcile mdev devices (clears orphaned vGPUs from crashed VMs)
205+
// Build mdev info from instances - only destroys mdevs tracked by hypeman
206+
logger.Info("Reconciling mdev devices...")
207+
var mdevInfos []devices.MdevReconcileInfo
208+
if allInstances != nil {
209+
for _, inst := range allInstances {
210+
if inst.GPUMdevUUID != "" {
211+
mdevInfos = append(mdevInfos, devices.MdevReconcileInfo{
212+
InstanceID: inst.Id,
213+
MdevUUID: inst.GPUMdevUUID,
214+
IsRunning: inst.State == instances.StateRunning || inst.State == instances.StateUnknown,
215+
})
216+
}
217+
}
218+
}
219+
if err := devices.ReconcileMdevs(app.Ctx, mdevInfos); err != nil {
220+
// Log but don't fail - mdev cleanup is best-effort
221+
logger.Warn("failed to reconcile mdev devices", "error", err)
222+
}
223+
203224
// Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams)
204225
logger.Info("Initializing ingress manager...")
205226
if err := app.IngressManager.Initialize(app.Ctx); err != nil {

lib/devices/mdev.go

Lines changed: 127 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package devices
22

33
import (
44
"bufio"
5+
"context"
56
"encoding/json"
67
"fmt"
78
"os"
@@ -10,15 +11,21 @@ import (
1011
"regexp"
1112
"strconv"
1213
"strings"
14+
"sync"
1315

1416
"github.com/google/uuid"
17+
"github.com/onkernel/hypeman/lib/logger"
1518
)
1619

1720
const (
1821
mdevBusPath = "/sys/class/mdev_bus"
1922
mdevDevices = "/sys/bus/mdev/devices"
2023
)
2124

25+
// mdevMu protects mdev creation/destruction to prevent race conditions
26+
// when multiple instances request vGPUs concurrently.
27+
var mdevMu sync.Mutex
28+
2229
// DiscoverVFs returns all SR-IOV Virtual Functions available for vGPU.
2330
// These are discovered by scanning /sys/class/mdev_bus/ which contains
2431
// VFs that can host mdev devices.
@@ -31,6 +38,13 @@ func DiscoverVFs() ([]VirtualFunction, error) {
3138
return nil, fmt.Errorf("read mdev_bus: %w", err)
3239
}
3340

41+
// List mdevs once and build a lookup map to avoid O(n*m) performance
42+
mdevs, _ := ListMdevDevices()
43+
mdevByVF := make(map[string]bool, len(mdevs))
44+
for _, mdev := range mdevs {
45+
mdevByVF[mdev.VFAddress] = true
46+
}
47+
3448
var vfs []VirtualFunction
3549
for _, entry := range entries {
3650
vfAddr := entry.Name()
@@ -43,8 +57,8 @@ func DiscoverVFs() ([]VirtualFunction, error) {
4357
parentGPU = filepath.Base(target)
4458
}
4559

46-
// Check if this VF already has an mdev
47-
hasMdev := vfHasMdev(vfAddr)
60+
// Check if this VF already has an mdev (using pre-built lookup map)
61+
hasMdev := mdevByVF[vfAddr]
4862

4963
vfs = append(vfs, VirtualFunction{
5064
PCIAddress: vfAddr,
@@ -56,18 +70,6 @@ func DiscoverVFs() ([]VirtualFunction, error) {
5670
return vfs, nil
5771
}
5872

59-
// vfHasMdev checks if a VF has an mdev device created on it
60-
func vfHasMdev(vfAddr string) bool {
61-
// Check if any mdev device has this VF as parent
62-
mdevs, _ := ListMdevDevices()
63-
for _, mdev := range mdevs {
64-
if mdev.VFAddress == vfAddr {
65-
return true
66-
}
67-
}
68-
return false
69-
}
70-
7173
// ListGPUProfiles returns available vGPU profiles with availability counts.
7274
// Profiles are discovered from the first VF's mdev_supported_types directory.
7375
func ListGPUProfiles() ([]GPUProfile, error) {
@@ -328,7 +330,15 @@ func getProfileNameFromType(profileType, vfAddress string) string {
328330

329331
// CreateMdev creates an mdev device for the given profile and instance.
330332
// It finds an available VF and creates the mdev, returning the device info.
331-
func CreateMdev(profileName, instanceID string) (*MdevDevice, error) {
333+
// This function is thread-safe and uses a mutex to prevent race conditions
334+
// when multiple instances request vGPUs concurrently.
335+
func CreateMdev(ctx context.Context, profileName, instanceID string) (*MdevDevice, error) {
336+
log := logger.FromContext(ctx)
337+
338+
// Lock to prevent race conditions when multiple instances request the same profile
339+
mdevMu.Lock()
340+
defer mdevMu.Unlock()
341+
332342
// Find profile type from name
333343
profileType, err := findProfileType(profileName)
334344
if err != nil {
@@ -364,12 +374,16 @@ func CreateMdev(profileName, instanceID string) (*MdevDevice, error) {
364374
// Generate UUID for the mdev
365375
mdevUUID := uuid.New().String()
366376

377+
log.DebugContext(ctx, "creating mdev device", "profile", profileName, "vf", targetVF, "uuid", mdevUUID, "instance_id", instanceID)
378+
367379
// Create mdev by writing UUID to create file
368380
createPath := filepath.Join(mdevBusPath, targetVF, "mdev_supported_types", profileType, "create")
369381
if err := os.WriteFile(createPath, []byte(mdevUUID), 0200); err != nil {
370-
return nil, fmt.Errorf("create mdev: %w", err)
382+
return nil, fmt.Errorf("create mdev on VF %s: %w", targetVF, err)
371383
}
372384

385+
log.InfoContext(ctx, "created mdev device", "profile", profileName, "vf", targetVF, "uuid", mdevUUID, "instance_id", instanceID)
386+
373387
return &MdevDevice{
374388
UUID: mdevUUID,
375389
VFAddress: targetVF,
@@ -381,40 +395,125 @@ func CreateMdev(profileName, instanceID string) (*MdevDevice, error) {
381395
}
382396

383397
// DestroyMdev removes an mdev device.
384-
func DestroyMdev(mdevUUID string) error {
398+
func DestroyMdev(ctx context.Context, mdevUUID string) error {
399+
log := logger.FromContext(ctx)
400+
401+
// Lock to prevent race conditions during destruction
402+
mdevMu.Lock()
403+
defer mdevMu.Unlock()
404+
405+
log.DebugContext(ctx, "destroying mdev device", "uuid", mdevUUID)
406+
385407
// Try mdevctl undefine first (removes persistent definition)
386-
exec.Command("mdevctl", "undefine", "--uuid", mdevUUID).Run()
408+
if err := exec.Command("mdevctl", "undefine", "--uuid", mdevUUID).Run(); err != nil {
409+
// Log at debug level - mdevctl might not be installed or mdev might not be defined
410+
log.DebugContext(ctx, "mdevctl undefine failed (may be expected)", "uuid", mdevUUID, "error", err)
411+
}
387412

388413
// Remove via sysfs
389414
removePath := filepath.Join(mdevDevices, mdevUUID, "remove")
390415
if err := os.WriteFile(removePath, []byte("1"), 0200); err != nil {
391416
if os.IsNotExist(err) {
417+
log.DebugContext(ctx, "mdev already removed", "uuid", mdevUUID)
392418
return nil // Already removed
393419
}
394-
return fmt.Errorf("remove mdev: %w", err)
420+
return fmt.Errorf("remove mdev %s: %w", mdevUUID, err)
395421
}
396422

423+
log.InfoContext(ctx, "destroyed mdev device", "uuid", mdevUUID)
397424
return nil
398425
}
399426

400-
// ReconcileMdevs destroys orphaned mdevs that are not attached to running instances.
427+
// IsMdevInUse checks if an mdev device is currently bound to a driver (in use by a VM).
428+
// An mdev with a driver symlink is actively attached to a hypervisor/VFIO.
429+
func IsMdevInUse(mdevUUID string) bool {
430+
driverPath := filepath.Join(mdevDevices, mdevUUID, "driver")
431+
_, err := os.Readlink(driverPath)
432+
return err == nil // Has a driver = in use
433+
}
434+
435+
// MdevReconcileInfo contains information needed to reconcile mdevs for an instance
436+
type MdevReconcileInfo struct {
437+
InstanceID string
438+
MdevUUID string
439+
IsRunning bool // true if instance's VMM is running or state is unknown
440+
}
441+
442+
// ReconcileMdevs destroys orphaned mdevs that belong to hypeman but are no longer in use.
401443
// This is called on server startup to clean up stale mdevs from previous runs.
402-
func ReconcileMdevs(isInstanceRunning func(string) bool) error {
444+
//
445+
// Safety guarantees:
446+
// - Only destroys mdevs that are tracked by hypeman instances (via hypemanMdevs map)
447+
// - Never destroys mdevs created by other processes on the host
448+
// - Skips mdevs that are currently bound to a driver (in use by a VM)
449+
// - Skips mdevs for instances in Running or Unknown state
450+
func ReconcileMdevs(ctx context.Context, instanceInfos []MdevReconcileInfo) error {
451+
log := logger.FromContext(ctx)
452+
403453
mdevs, err := ListMdevDevices()
404454
if err != nil {
405455
return fmt.Errorf("list mdevs: %w", err)
406456
}
407457

458+
if len(mdevs) == 0 {
459+
log.DebugContext(ctx, "no mdev devices found to reconcile")
460+
return nil
461+
}
462+
463+
// Build lookup maps from instance info
464+
// mdevUUID -> instanceID for mdevs managed by hypeman
465+
hypemanMdevs := make(map[string]string, len(instanceInfos))
466+
// instanceID -> isRunning for liveness check
467+
instanceRunning := make(map[string]bool, len(instanceInfos))
468+
for _, info := range instanceInfos {
469+
if info.MdevUUID != "" {
470+
hypemanMdevs[info.MdevUUID] = info.InstanceID
471+
instanceRunning[info.InstanceID] = info.IsRunning
472+
}
473+
}
474+
475+
log.InfoContext(ctx, "reconciling mdev devices", "total_mdevs", len(mdevs), "hypeman_mdevs", len(hypemanMdevs))
476+
477+
var destroyed, skippedNotOurs, skippedInUse, skippedRunning int
408478
for _, mdev := range mdevs {
409-
// If mdev has an instance ID and that instance is not running, destroy it
410-
// If mdev has no instance ID, it's orphaned (created but never tracked)
411-
if mdev.InstanceID == "" || !isInstanceRunning(mdev.InstanceID) {
412-
if err := DestroyMdev(mdev.UUID); err != nil {
413-
// Log but continue - best effort cleanup
414-
continue
415-
}
479+
// Only consider mdevs that hypeman created
480+
instanceID, isOurs := hypemanMdevs[mdev.UUID]
481+
if !isOurs {
482+
log.DebugContext(ctx, "skipping mdev not managed by hypeman", "uuid", mdev.UUID, "profile", mdev.ProfileName)
483+
skippedNotOurs++
484+
continue
416485
}
486+
487+
// Skip if instance is running or in unknown state (might still be using the mdev)
488+
if instanceRunning[instanceID] {
489+
log.DebugContext(ctx, "skipping mdev for running/unknown instance", "uuid", mdev.UUID, "instance_id", instanceID)
490+
skippedRunning++
491+
continue
492+
}
493+
494+
// Check if mdev is bound to a driver (in use by VM)
495+
if IsMdevInUse(mdev.UUID) {
496+
log.WarnContext(ctx, "skipping mdev still bound to driver", "uuid", mdev.UUID, "instance_id", instanceID)
497+
skippedInUse++
498+
continue
499+
}
500+
501+
// Safe to destroy - it's ours, instance is not running, and not bound to driver
502+
log.InfoContext(ctx, "destroying orphaned mdev", "uuid", mdev.UUID, "profile", mdev.ProfileName, "instance_id", instanceID)
503+
if err := DestroyMdev(ctx, mdev.UUID); err != nil {
504+
// Log error but continue - best effort cleanup
505+
log.WarnContext(ctx, "failed to destroy orphaned mdev", "uuid", mdev.UUID, "error", err)
506+
continue
507+
}
508+
destroyed++
417509
}
418510

511+
log.InfoContext(ctx, "mdev reconciliation complete",
512+
"destroyed", destroyed,
513+
"skipped_not_ours", skippedNotOurs,
514+
"skipped_in_use", skippedInUse,
515+
"skipped_running", skippedRunning,
516+
)
517+
419518
return nil
420519
}

lib/instances/create.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ func (m *manager) createInstance(
260260
// Handle vGPU profile request - create mdev device
261261
if req.GPU != nil && req.GPU.Profile != "" {
262262
log.InfoContext(ctx, "creating vGPU mdev", "instance_id", id, "profile", req.GPU.Profile)
263-
mdev, err := devices.CreateMdev(req.GPU.Profile, id)
263+
mdev, err := devices.CreateMdev(ctx, req.GPU.Profile, id)
264264
if err != nil {
265265
log.ErrorContext(ctx, "failed to create mdev", "profile", req.GPU.Profile, "error", err)
266266
return nil, fmt.Errorf("create vGPU mdev for profile %s: %w", req.GPU.Profile, err)
@@ -272,7 +272,7 @@ func (m *manager) createInstance(
272272
// Add mdev cleanup to stack
273273
cu.Add(func() {
274274
log.DebugContext(ctx, "destroying mdev on cleanup", "instance_id", id, "uuid", gpuMdevUUID)
275-
devices.DestroyMdev(gpuMdevUUID)
275+
devices.DestroyMdev(ctx, gpuMdevUUID)
276276
})
277277
}
278278

lib/instances/delete.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func (m *manager) deleteInstance(
9898
// 6c. Destroy vGPU mdev device if present
9999
if inst.GPUMdevUUID != "" {
100100
log.InfoContext(ctx, "destroying vGPU mdev", "instance_id", id, "uuid", inst.GPUMdevUUID)
101-
if err := devices.DestroyMdev(inst.GPUMdevUUID); err != nil {
101+
if err := devices.DestroyMdev(ctx, inst.GPUMdevUUID); err != nil {
102102
// Log error but continue with cleanup
103103
log.WarnContext(ctx, "failed to destroy mdev, continuing with cleanup", "instance_id", id, "uuid", inst.GPUMdevUUID, "error", err)
104104
}

0 commit comments

Comments
 (0)